1 /*
2  * Copyright (c) 2015, The Android Open Source Project
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *   * Redistributions of source code must retain the above copyright
9  *     notice, this list of conditions and the following disclaimer.
10  *   * Redistributions in binary form must reproduce the above copyright
11  *     notice, this list of conditions and the following disclaimer
12  *     in the documentation and/or other materials provided with the
13  *     distribution.
14  *   * Neither the name of Google, Inc. nor the names of its contributors
15  *     may be used to endorse or promote products derived from this
16  *     software without specific prior written permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22  * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25  * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
26  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
28  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <binder/IBinder.h>
33 #include <binder/IServiceManager.h>
34 #include <binder/Parcel.h>
35 
36 #include <cutils/properties.h>
37 #include <signal.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <ctime>
42 
43 #include <sys/resource.h>
44 #include <sys/stat.h>
45 #include <sys/time.h>
46 #include <sys/types.h>
47 #include <unistd.h>
48 
49 #include <utils/Log.h>
50 #include <utils/String8.h>
51 #include <utils/Trace.h>
52 #include <zlib.h>
53 
54 using namespace android;
55 
56 #ifdef LOG_TAG
57 #undef LOG_TAG
58 #endif
59 
60 #define LOG_TAG "anrdaemon"
61 
62 static const int check_period = 1;               // in sec
63 static const int tracing_check_period = 500000;  // in micro sec
64 static const int cpu_stat_entries = 7;           // number of cpu stat entries
65 static const int min_buffer_size = 16;
66 static const int max_buffer_size = 2048;
67 static const char* min_buffer_size_str = "16";
68 static const char* max_buffer_size_str = "2048";
69 static const int time_buf_size = 20;
70 static const int path_buf_size = 60;
71 
72 typedef struct cpu_stat {
73     unsigned long utime, ntime, stime, itime;
74     unsigned long iowtime, irqtime, sirqtime, steal;
75     unsigned long total;
76 } cpu_stat_t;
77 
78 /*
79  * Logging on/off threshold.
80  * Uint: 0.01%; default to 99.90% cpu.
81  */
82 static int idle_threshold = 10;
83 
84 static bool quit = false;
85 static bool suspend = false;
86 static bool dump_requested = false;
87 static bool err = false;
88 static char err_msg[100];
89 static bool tracing = false;
90 
91 static const char* buf_size_kb = "2048";
92 static const char* apps = "";
93 static uint64_t tag = 0;
94 
95 static cpu_stat_t new_cpu;
96 static cpu_stat_t old_cpu;
97 
98 /* Log certain kernel activity when enabled */
99 static bool log_sched = false;
100 static bool log_stack = false;
101 static bool log_irq = false;
102 static bool log_sync = false;
103 static bool log_workq = false;
104 
105 /* Paths for debugfs controls*/
106 static const char* dfs_trace_output_path = "/d/tracing/trace";
107 static const char* dfs_irq_path = "/d/tracing/events/irq/enable";
108 static const char* dfs_sync_path = "/d/tracing/events/sync/enable";
109 static const char* dfs_workq_path = "/d/tracing/events/workqueue/enable";
110 static const char* dfs_stack_path = "/d/tracing/options/stacktrace";
111 static const char* dfs_sched_switch_path = "/d/tracing/events/sched/sched_switch/enable";
112 static const char* dfs_sched_wakeup_path = "/d/tracing/events/sched/sched_wakeup/enable";
113 static const char* dfs_control_path = "/d/tracing/tracing_on";
114 static const char* dfs_buffer_size_path = "/d/tracing/buffer_size_kb";
115 static const char* dfs_tags_property = "debug.atrace.tags.enableflags";
116 static const char* dfs_apps_property = "debug.atrace.app_cmdlines";
117 
118 /*
119  * Read accumulated cpu data from /proc/stat
120  */
get_cpu_stat(cpu_stat_t * cpu)121 static void get_cpu_stat(cpu_stat_t* cpu) {
122     FILE* fp = NULL;
123     const char* params = "cpu  %lu %lu %lu %lu %lu %lu %lu %*d %*d %*d\n";
124 
125     if ((fp = fopen("/proc/stat", "r")) == NULL) {
126         err = true;
127         snprintf(err_msg, sizeof(err_msg), "can't read from /proc/stat with errno %d", errno);
128     } else {
129         if (fscanf(fp, params, &cpu->utime, &cpu->ntime, &cpu->stime, &cpu->itime, &cpu->iowtime,
130                    &cpu->irqtime, &cpu->sirqtime) != cpu_stat_entries) {
131             /*
132              * If failed in getting status, new_cpu won't be updated and
133              * is_heavy_loaded() will return false.
134              */
135             ALOGE("Error in getting cpu status. Skipping this check.");
136             fclose(fp);
137             return;
138         }
139 
140         cpu->total = cpu->utime + cpu->ntime + cpu->stime + cpu->itime + cpu->iowtime +
141                      cpu->irqtime + cpu->sirqtime;
142 
143         fclose(fp);
144     }
145 }
146 
147 /*
148  * Calculate cpu usage in the past interval.
149  * If tracing is on, increase the idle threshold by 1.00% so that we do not
150  * turn on and off tracing frequently when the cpu load is right close to
151  * threshold.
152  */
is_heavy_load(void)153 static bool is_heavy_load(void) {
154     unsigned long diff_idle, diff_total;
155     int threshold = idle_threshold + (tracing ? 100 : 0);
156     get_cpu_stat(&new_cpu);
157     diff_idle = new_cpu.itime - old_cpu.itime;
158     diff_total = new_cpu.total - old_cpu.total;
159     old_cpu = new_cpu;
160     return (diff_idle * 10000 < diff_total * threshold);
161 }
162 
163 /*
164  * Force the userland processes to refresh their property for logging.
165  */
dfs_poke_binder(void)166 static void dfs_poke_binder(void) {
167     sp<IServiceManager> sm = defaultServiceManager();
168     Vector<String16> services = sm->listServices();
169     for (size_t i = 0; i < services.size(); i++) {
170         sp<IBinder> obj = sm->checkService(services[i]);
171         if (obj != NULL) {
172             Parcel data;
173             obj->transact(IBinder::SYSPROPS_TRANSACTION, data, NULL, 0);
174         }
175     }
176 }
177 
178 /*
179  * Enable/disable a debugfs property by writing 0/1 to its path.
180  */
dfs_enable(bool enable,const char * path)181 static int dfs_enable(bool enable, const char* path) {
182     int fd = open(path, O_WRONLY);
183     if (fd == -1) {
184         err = true;
185         snprintf(err_msg, sizeof(err_msg), "Can't open %s. Error: %d", path, errno);
186         return -1;
187     }
188     const char* control = (enable ? "1" : "0");
189     ssize_t len = strlen(control);
190     int max_try = 10;  // Fail if write was interrupted for 10 times
191     while (write(fd, control, len) != len) {
192         if (errno == EINTR && max_try-- > 0) {
193             usleep(100);
194             continue;
195         }
196 
197         err = true;
198         snprintf(err_msg, sizeof(err_msg), "Error %d in writing to %s.", errno, path);
199     }
200     close(fd);
201     return (err ? -1 : 0);
202 }
203 
204 /*
205  * Set the userland tracing properties.
206  */
dfs_set_property(uint64_t mtag,const char * mapp,bool enable)207 static void dfs_set_property(uint64_t mtag, const char* mapp, bool enable) {
208     char buf[64];
209     snprintf(buf, sizeof(buf), "%#" PRIx64, mtag);
210     if (property_set(dfs_tags_property, buf) < 0) {
211         err = true;
212         snprintf(err_msg, sizeof(err_msg), "Failed to set debug tags system properties.");
213     }
214 
215     if (strlen(mapp) > 0 && property_set(dfs_apps_property, mapp) < 0) {
216         err = true;
217         snprintf(err_msg, sizeof(err_msg), "Failed to set debug applications.");
218     }
219 
220     if (log_sched) {
221         dfs_enable(enable, dfs_sched_switch_path);
222         dfs_enable(enable, dfs_sched_wakeup_path);
223     }
224     if (log_stack) {
225         dfs_enable(enable, dfs_stack_path);
226     }
227     if (log_irq) {
228         dfs_enable(enable, dfs_irq_path);
229     }
230     if (log_sync) {
231         dfs_enable(enable, dfs_sync_path);
232     }
233     if (log_workq) {
234         dfs_enable(enable, dfs_workq_path);
235     }
236 }
237 
238 /*
239  * Dump the log in a compressed format for systrace to visualize.
240  * Create a dump file "dump_of_anrdaemon.<current_time>" under /data/misc/anrd
241  */
dump_trace()242 static void dump_trace() {
243     time_t now = time(0);
244     struct tm tstruct;
245     char time_buf[time_buf_size];
246     char path_buf[path_buf_size];
247     const char* header = " done\nTRACE:\n";
248     ssize_t header_len = strlen(header);
249 
250     ALOGI("Started to dump ANRdaemon trace.");
251 
252     tstruct = *localtime(&now);
253     strftime(time_buf, time_buf_size, "%Y-%m-%d.%X", &tstruct);
254     snprintf(path_buf, path_buf_size, "/data/misc/anrd/dump_of_anrdaemon.%s", time_buf);
255     int output_fd = creat(path_buf, S_IRWXU);
256     if (output_fd == -1) {
257         ALOGE("Failed to create %s. Dump aborted.", path_buf);
258         return;
259     }
260 
261     if (write(output_fd, header, strlen(header)) != header_len) {
262         ALOGE("Failed to write the header.");
263         close(output_fd);
264         return;
265     }
266 
267     int trace_fd = open(dfs_trace_output_path, O_RDWR);
268     if (trace_fd == -1) {
269         ALOGE("Failed to open %s. Dump aborted.", dfs_trace_output_path);
270         close(output_fd);
271         return;
272     }
273 
274     z_stream zs;
275     uint8_t *in, *out;
276     int result, flush;
277 
278     memset(&zs, 0, sizeof(zs));
279     result = deflateInit(&zs, Z_DEFAULT_COMPRESSION);
280     if (result != Z_OK) {
281         ALOGE("error initializing zlib: %d\n", result);
282         close(trace_fd);
283         close(output_fd);
284         return;
285     }
286 
287     const size_t bufSize = 64 * 1024;
288     in = (uint8_t*)malloc(bufSize);
289     out = (uint8_t*)malloc(bufSize);
290     flush = Z_NO_FLUSH;
291 
292     zs.next_out = out;
293     zs.avail_out = bufSize;
294 
295     do {
296         if (zs.avail_in == 0) {
297             result = read(trace_fd, in, bufSize);
298             if (result < 0) {
299                 ALOGE("error reading trace: %s", strerror(errno));
300                 result = Z_STREAM_END;
301                 break;
302             } else if (result == 0) {
303                 flush = Z_FINISH;
304             } else {
305                 zs.next_in = in;
306                 zs.avail_in = result;
307             }
308         }
309 
310         if (zs.avail_out == 0) {
311             result = write(output_fd, out, bufSize);
312             if ((size_t)result < bufSize) {
313                 ALOGE("error writing deflated trace: %s", strerror(errno));
314                 result = Z_STREAM_END;
315                 zs.avail_out = bufSize;
316                 break;
317             }
318             zs.next_out = out;
319             zs.avail_out = bufSize;
320         }
321 
322     } while ((result = deflate(&zs, flush)) == Z_OK);
323 
324     if (result != Z_STREAM_END) {
325         ALOGE("error deflating trace: %s\n", zs.msg);
326     }
327 
328     if (zs.avail_out < bufSize) {
329         size_t bytes = bufSize - zs.avail_out;
330         result = write(output_fd, out, bytes);
331         if ((size_t)result < bytes) {
332             ALOGE("error writing deflated trace: %s", strerror(errno));
333         }
334     }
335 
336     result = deflateEnd(&zs);
337     if (result != Z_OK) {
338         ALOGE("error cleaning up zlib: %d\n", result);
339     }
340 
341     free(in);
342     free(out);
343 
344     close(trace_fd);
345     close(output_fd);
346 
347     ALOGI("Finished dump. Output file stored at: %s", path_buf);
348 }
349 
350 /*
351  * Start logging when cpu usage is high. Meanwhile, moniter the cpu usage and
352  * stop logging when it drops down.
353  */
start_tracing(void)354 static void start_tracing(void) {
355     ALOGD("High cpu usage, start logging.");
356 
357     if (dfs_enable(true, dfs_control_path) != 0) {
358         ALOGE("Failed to start tracing.");
359         return;
360     }
361     tracing = true;
362 
363     /* Stop logging when cpu usage drops or the daemon is suspended.*/
364     do {
365         usleep(tracing_check_period);
366     } while (!suspend && !dump_requested && is_heavy_load());
367 
368     if (dfs_enable(false, dfs_control_path) != 0) {
369         ALOGE("Failed to stop tracing.");
370         err = true;
371         return;
372     }
373     tracing = false;
374 
375     if (suspend) {
376         ALOGI("trace stopped due to suspend. Send SIGCONT to resume.");
377     } else if (dump_requested) {
378         ALOGI("trace stopped due to dump request.");
379         dump_trace();
380         dump_requested = false;
381     } else {
382         ALOGD("Usage back to low, stop logging.");
383     }
384 }
385 
386 /*
387  * Set the tracing log buffer size.
388  * Note the actual buffer size will be buf_size_kb * number of cores.
389  */
set_tracing_buffer_size(void)390 static int set_tracing_buffer_size(void) {
391     int fd = open(dfs_buffer_size_path, O_WRONLY);
392     if (fd == -1) {
393         err = true;
394         snprintf(err_msg, sizeof(err_msg), "Can't open atrace buffer size file under /d/tracing.");
395         return -1;
396     }
397     ssize_t len = strlen(buf_size_kb);
398     if (write(fd, buf_size_kb, len) != len) {
399         err = true;
400         snprintf(err_msg, sizeof(err_msg), "Error in writing to atrace buffer size file.");
401     }
402     close(fd);
403     return (err ? -1 : 0);
404 }
405 
406 /*
407  * Main loop to moniter the cpu usage and decided whether to start logging.
408  */
start(void)409 static void start(void) {
410     if ((set_tracing_buffer_size()) != 0) return;
411 
412     dfs_set_property(tag, apps, true);
413     dfs_poke_binder();
414 
415     get_cpu_stat(&old_cpu);
416     sleep(check_period);
417 
418     while (!quit && !err) {
419         if (!suspend && is_heavy_load()) {
420             /*
421              * Increase process priority to make sure we can stop logging when
422              * necessary and do not overwrite the buffer
423              */
424             setpriority(PRIO_PROCESS, 0, -20);
425             start_tracing();
426             setpriority(PRIO_PROCESS, 0, 0);
427         }
428         sleep(check_period);
429     }
430     return;
431 }
432 
433 /*
434  * If trace is not running, dump trace right away.
435  * If trace is running, request to dump trace.
436  */
request_dump_trace()437 static void request_dump_trace() {
438     if (!tracing) {
439         dump_trace();
440     } else if (!dump_requested) {
441         dump_requested = true;
442     }
443 }
444 
handle_signal(int signo)445 static void handle_signal(int signo) {
446     switch (signo) {
447         case SIGQUIT:
448             suspend = true;
449             quit = true;
450             break;
451         case SIGSTOP:
452             suspend = true;
453             break;
454         case SIGCONT:
455             suspend = false;
456             break;
457         case SIGUSR1:
458             request_dump_trace();
459     }
460 }
461 
462 /*
463  * Set the signal handler:
464  * SIGQUIT: Reset debugfs and tracing property and terminate the daemon.
465  * SIGSTOP: Stop logging and suspend the daemon.
466  * SIGCONT: Resume the daemon as normal.
467  * SIGUSR1: Dump the logging to a compressed format for systrace to visualize.
468  */
register_sighandler(void)469 static void register_sighandler(void) {
470     struct sigaction sa;
471     sigset_t block_mask;
472 
473     sigemptyset(&block_mask);
474     sigaddset(&block_mask, SIGQUIT);
475     sigaddset(&block_mask, SIGSTOP);
476     sigaddset(&block_mask, SIGCONT);
477     sigaddset(&block_mask, SIGUSR1);
478 
479     sa.sa_flags = 0;
480     sa.sa_mask = block_mask;
481     sa.sa_handler = handle_signal;
482     sigaction(SIGQUIT, &sa, NULL);
483     sigaction(SIGSTOP, &sa, NULL);
484     sigaction(SIGCONT, &sa, NULL);
485     sigaction(SIGUSR1, &sa, NULL);
486 }
487 
show_help(void)488 static void show_help(void) {
489     fprintf(stderr, "usage: anrd [options] [categoris...]\n");
490     fprintf(stdout,
491             "Options includes:\n"
492             "   -a appname  enable app-level tracing for a comma "
493             "separated list of cmdlines\n"
494             "   -t N        cpu threshold for logging to start "
495             "(uint = 0.01%%, min = 5000, max = 9999, default = 9990)\n"
496             "   -s N        use a trace buffer size of N KB "
497             "default to 2048KB\n"
498             "   -h          show helps\n");
499     fprintf(stdout,
500             "Categoris includes:\n"
501             "   am         - activity manager\n"
502             "   sm         - sync manager\n"
503             "   input      - input\n"
504             "   dalvik     - dalvik VM\n"
505             "   audio      - Audio\n"
506             "   gfx        - Graphics\n"
507             "   rs         - RenderScript\n"
508             "   hal        - Hardware Modules\n"
509             "   irq        - kernel irq events\n"
510             "   sched      - kernel scheduler activity\n"
511             "   stack      - kernel stack\n"
512             "   sync       - kernel sync activity\n"
513             "   workq      - kernel work queues\n");
514     fprintf(stdout,
515             "Control includes:\n"
516             "   SIGQUIT: terminate the process\n"
517             "   SIGSTOP: suspend all function of the daemon\n"
518             "   SIGCONT: resume the normal function\n"
519             "   SIGUSR1: dump the current logging in a compressed form\n");
520     exit(0);
521 }
522 
get_options(int argc,char * argv[])523 static int get_options(int argc, char* argv[]) {
524     int opt = 0;
525     int threshold;
526     while ((opt = getopt(argc, argv, "a:s:t:h")) >= 0) {
527         switch (opt) {
528             case 'a':
529                 apps = optarg;
530                 break;
531             case 's':
532                 if (atoi(optarg) > max_buffer_size)
533                     buf_size_kb = max_buffer_size_str;
534                 else if (atoi(optarg) < min_buffer_size)
535                     buf_size_kb = min_buffer_size_str;
536                 else
537                     buf_size_kb = optarg;
538                 break;
539             case 't':
540                 threshold = atoi(optarg);
541                 if (threshold > 9999 || threshold < 5000) {
542                     fprintf(stderr, "logging threshold should be 5000-9999\n");
543                     return 1;
544                 }
545                 idle_threshold = 10000 - threshold;
546                 break;
547             case 'h':
548                 show_help();
549                 break;
550             default:
551                 fprintf(stderr,
552                         "Error in getting options.\n"
553                         "run \"%s -h\" for usage.\n",
554                         argv[0]);
555                 return 1;
556         }
557     }
558 
559     for (int i = optind; i < argc; i++) {
560         if (strcmp(argv[i], "am") == 0) {
561             tag |= ATRACE_TAG_ACTIVITY_MANAGER;
562         } else if (strcmp(argv[i], "input") == 0) {
563             tag |= ATRACE_TAG_INPUT;
564         } else if (strcmp(argv[i], "sm") == 0) {
565             tag |= ATRACE_TAG_SYNC_MANAGER;
566         } else if (strcmp(argv[i], "dalvik") == 0) {
567             tag |= ATRACE_TAG_DALVIK;
568         } else if (strcmp(argv[i], "gfx") == 0) {
569             tag |= ATRACE_TAG_GRAPHICS;
570         } else if (strcmp(argv[i], "audio") == 0) {
571             tag |= ATRACE_TAG_AUDIO;
572         } else if (strcmp(argv[i], "hal") == 0) {
573             tag |= ATRACE_TAG_HAL;
574         } else if (strcmp(argv[i], "rs") == 0) {
575             tag |= ATRACE_TAG_RS;
576         } else if (strcmp(argv[i], "sched") == 0) {
577             log_sched = true;
578         } else if (strcmp(argv[i], "stack") == 0) {
579             log_stack = true;
580         } else if (strcmp(argv[i], "workq") == 0) {
581             log_workq = true;
582         } else if (strcmp(argv[i], "irq") == 0) {
583             log_irq = true;
584         } else if (strcmp(argv[i], "sync") == 0) {
585             log_sync = true;
586         } else {
587             fprintf(stderr,
588                     "invalid category: %s\n"
589                     "run \"%s -h\" for usage.\n",
590                     argv[i], argv[0]);
591             return 1;
592         }
593     }
594 
595     /* If nothing is enabled, don't run */
596     if (!tag && !log_sched && !log_stack && !log_workq && !log_irq && !log_sync) {
597         ALOGE("Specify at least one category to trace.");
598         return 1;
599     }
600 
601     return 0;
602 }
603 
main(int argc,char * argv[])604 int main(int argc, char* argv[]) {
605     if (get_options(argc, argv) != 0) return 1;
606 
607     if (daemon(0, 0) != 0) return 1;
608 
609     register_sighandler();
610 
611     /* Clear any the trace log file by overwrite it with a new file */
612     int fd = creat(dfs_trace_output_path, 0);
613     if (fd == -1) {
614         ALOGE("Faield to open and cleaup previous log");
615         return 1;
616     }
617     close(fd);
618 
619     ALOGI("ANRdaemon starting");
620     start();
621 
622     if (err) ALOGE("ANRdaemon stopped due to Error: %s", err_msg);
623 
624     ALOGI("ANRdaemon terminated.");
625 
626     return (err ? 1 : 0);
627 }
628