1 /*
2 * Copyright 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "debuggerd/handler.h"
18
19 #include <errno.h>
20 #include <fcntl.h>
21 #include <inttypes.h>
22 #include <linux/futex.h>
23 #include <pthread.h>
24 #include <sched.h>
25 #include <signal.h>
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <sys/capability.h>
32 #include <sys/mman.h>
33 #include <sys/prctl.h>
34 #include <sys/socket.h>
35 #include <sys/syscall.h>
36 #include <sys/uio.h>
37 #include <sys/un.h>
38 #include <sys/wait.h>
39 #include <unistd.h>
40
41 #include <android-base/macros.h>
42 #include <android-base/parsebool.h>
43 #include <android-base/properties.h>
44 #include <android-base/unique_fd.h>
45 #include <async_safe/log.h>
46 #include <bionic/reserved_signals.h>
47
48 #include <libdebuggerd/utility.h>
49
50 #include "dump_type.h"
51 #include "protocol.h"
52
53 #include "handler/fallback.h"
54
55 using ::android::base::ParseBool;
56 using ::android::base::ParseBoolResult;
57 using ::android::base::Pipe;
58
// Closer policy for unique_fd_impl. The handler manipulates fds from a 'thread' that does not
// share the process's fd table, so descriptors are released with a raw close syscall rather
// than through libc's close().
struct FdsanBypassCloser {
  // Closes `fd` via the close syscall; the result is intentionally ignored.
  static void Close(int fd) { static_cast<void>(syscall(__NR_close, fd)); }
};
66
67 using unique_fd = android::base::unique_fd_impl<FdsanBypassCloser>;
68
69 // see man(2) prctl, specifically the section about PR_GET_NAME
70 #define MAX_TASK_NAME_LEN (16)
71
72 #if defined(__LP64__)
73 #define CRASH_DUMP_NAME "crash_dump64"
74 #else
75 #define CRASH_DUMP_NAME "crash_dump32"
76 #endif
77
78 #define CRASH_DUMP_PATH "/apex/com.android.runtime/bin/" CRASH_DUMP_NAME
79
80 // Wrappers that directly invoke the respective syscalls, in case the cached values are invalid.
81 #pragma GCC poison getpid gettid
// Returns the process id by issuing the getpid syscall directly instead of calling libc.
static pid_t __getpid() {
  const long raw_pid = syscall(__NR_getpid);
  return static_cast<pid_t>(raw_pid);
}
85
// Returns the calling thread's id by issuing the gettid syscall directly instead of calling libc.
static pid_t __gettid() {
  const long raw_tid = syscall(__NR_gettid);
  return static_cast<pid_t>(raw_tid);
}
89
property_parse_bool(const char * name)90 static bool property_parse_bool(const char* name) {
91 const prop_info* pi = __system_property_find(name);
92 if (!pi) return false;
93 bool cookie = false;
94 __system_property_read_callback(
95 pi,
96 [](void* cookie, const char*, const char* value, uint32_t) {
97 *reinterpret_cast<bool*>(cookie) = ParseBool(value) == ParseBoolResult::kTrue;
98 },
99 &cookie);
100 return cookie;
101 }
102
is_permissive_mte()103 static bool is_permissive_mte() {
104 // Environment variable for testing or local use from shell.
105 char* permissive_env = getenv("MTE_PERMISSIVE");
106 char process_sysprop_name[512];
107 async_safe_format_buffer(process_sysprop_name, sizeof(process_sysprop_name),
108 "persist.device_config.memory_safety_native.permissive.process.%s",
109 getprogname());
110 // DO NOT REPLACE this with GetBoolProperty. That uses std::string which allocates, so it is
111 // not async-safe, and this function gets used in a signal handler.
112 return property_parse_bool("persist.sys.mte.permissive") ||
113 property_parse_bool("persist.device_config.memory_safety_native.permissive.default") ||
114 property_parse_bool(process_sysprop_name) ||
115 (permissive_env && ParseBool(permissive_env) == ParseBoolResult::kTrue);
116 }
117
// Issues a FUTEX_WAIT on the word at `ftx` with expected value `value` and no timeout.
// Per futex(2) the call returns immediately if the word no longer holds `value`; the syscall
// result is ignored either way.
static inline void futex_wait(volatile void* ftx, int value) {
  static_cast<void>(syscall(__NR_futex, ftx, FUTEX_WAIT, value, /*timeout=*/nullptr,
                            /*uaddr2=*/nullptr, /*val3=*/0));
}
121
// Scope guard for errno: remembers the value at construction and writes it back on destruction.
class ErrnoRestorer {
 public:
  ErrnoRestorer() : saved_errno_(errno) {}

  ~ErrnoRestorer() { errno = saved_errno_; }

 private:
  // errno as it was when this object was constructed.
  int saved_errno_;
};
134
135 extern "C" void* android_fdsan_get_fd_table();
136 extern "C" void debuggerd_fallback_handler(siginfo_t*, ucontext_t*, void*);
137
138 static debuggerd_callbacks_t g_callbacks;
139
140 // Mutex to ensure only one crashing thread dumps itself.
141 static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER;
142
143 // Don't use async_safe_fatal because it exits via abort, which might put us back into
144 // a signal handler.
fatal(const char * fmt,...)145 static void __noreturn __printflike(1, 2) fatal(const char* fmt, ...) {
146 va_list args;
147 va_start(args, fmt);
148 async_safe_format_log_va_list(ANDROID_LOG_FATAL, "libc", fmt, args);
149 _exit(1);
150 }
151
fatal_errno(const char * fmt,...)152 static void __noreturn __printflike(1, 2) fatal_errno(const char* fmt, ...) {
153 int err = errno;
154 va_list args;
155 va_start(args, fmt);
156
157 char buf[256];
158 async_safe_format_buffer_va_list(buf, sizeof(buf), fmt, args);
159 fatal("%s: %s", buf, strerror(err));
160 }
161
get_main_thread_name(char * buf,size_t len)162 static bool get_main_thread_name(char* buf, size_t len) {
163 unique_fd fd(open("/proc/self/comm", O_RDONLY | O_CLOEXEC));
164 if (fd == -1) {
165 return false;
166 }
167
168 ssize_t rc = read(fd, buf, len);
169 if (rc == -1) {
170 return false;
171 } else if (rc == 0) {
172 // Should never happen?
173 return false;
174 }
175
176 // There's a trailing newline, replace it with a NUL.
177 buf[rc - 1] = '\0';
178 return true;
179 }
180
181 /*
182 * Writes a summary of the signal to the log file. We do this so that, if
183 * for some reason we're not able to contact debuggerd, there is still some
184 * indication of the failure in the log.
185 *
186 * We could be here as a result of native heap corruption, or while a
187 * mutex is being held, so we don't want to use any libc functions that
188 * could allocate memory or hold a lock.
189 */
log_signal_summary(const siginfo_t * si)190 static void log_signal_summary(const siginfo_t* si) {
191 char main_thread_name[MAX_TASK_NAME_LEN + 1];
192 if (!get_main_thread_name(main_thread_name, sizeof(main_thread_name))) {
193 strncpy(main_thread_name, "<unknown>", sizeof(main_thread_name));
194 }
195
196 if (si->si_signo == BIONIC_SIGNAL_DEBUGGER) {
197 async_safe_format_log(ANDROID_LOG_INFO, "libc", "Requested dump for pid %d (%s)", __getpid(),
198 main_thread_name);
199 return;
200 }
201
202 // Many signals don't have a sender or extra detail, but some do...
203 pid_t self_pid = __getpid();
204 char sender_desc[32] = {}; // " from pid 1234, uid 666"
205 if (signal_has_sender(si, self_pid)) {
206 get_signal_sender(sender_desc, sizeof(sender_desc), si);
207 }
208 char extra_desc[32] = {}; // ", fault addr 0x1234" or ", syscall 1234"
209 if (si->si_signo == SIGSYS && si->si_code == SYS_SECCOMP) {
210 async_safe_format_buffer(extra_desc, sizeof(extra_desc), ", syscall %d", si->si_syscall);
211 } else if (signal_has_si_addr(si)) {
212 async_safe_format_buffer(extra_desc, sizeof(extra_desc), ", fault addr %p", si->si_addr);
213 }
214
215 char thread_name[MAX_TASK_NAME_LEN + 1]; // one more for termination
216 if (prctl(PR_GET_NAME, reinterpret_cast<unsigned long>(thread_name), 0, 0, 0) != 0) {
217 strcpy(thread_name, "<name unknown>");
218 } else {
219 // short names are null terminated by prctl, but the man page
220 // implies that 16 byte names are not.
221 thread_name[MAX_TASK_NAME_LEN] = 0;
222 }
223
224 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
225 "Fatal signal %d (%s), code %d (%s%s)%s in tid %d (%s), pid %d (%s)",
226 si->si_signo, get_signame(si), si->si_code, get_sigcode(si), sender_desc,
227 extra_desc, __gettid(), thread_name, self_pid, main_thread_name);
228 }
229
230 /*
231 * Returns true if the handler for signal "signum" has SA_SIGINFO set.
232 */
have_siginfo(int signum)233 static bool have_siginfo(int signum) {
234 struct sigaction old_action;
235 if (sigaction(signum, nullptr, &old_action) < 0) {
236 async_safe_format_log(ANDROID_LOG_WARN, "libc", "Failed testing for SA_SIGINFO: %s",
237 strerror(errno));
238 return false;
239 }
240 return (old_action.sa_flags & SA_SIGINFO) != 0;
241 }
242
raise_caps()243 static void raise_caps() {
244 // Raise CapInh to match CapPrm, so that we can set the ambient bits.
245 __user_cap_header_struct capheader;
246 memset(&capheader, 0, sizeof(capheader));
247 capheader.version = _LINUX_CAPABILITY_VERSION_3;
248 capheader.pid = 0;
249
250 __user_cap_data_struct capdata[2];
251 if (capget(&capheader, &capdata[0]) == -1) {
252 fatal_errno("capget failed");
253 }
254
255 if (capdata[0].permitted != capdata[0].inheritable ||
256 capdata[1].permitted != capdata[1].inheritable) {
257 capdata[0].inheritable = capdata[0].permitted;
258 capdata[1].inheritable = capdata[1].permitted;
259
260 if (capset(&capheader, &capdata[0]) == -1) {
261 async_safe_format_log(ANDROID_LOG_ERROR, "libc", "capset failed: %s", strerror(errno));
262 }
263 }
264
265 // Set the ambient capability bits so that crash_dump gets all of our caps and can ptrace us.
266 uint64_t capmask = capdata[0].inheritable;
267 capmask |= static_cast<uint64_t>(capdata[1].inheritable) << 32;
268 for (unsigned long i = 0; i < 64; ++i) {
269 if (capmask & (1ULL << i)) {
270 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0, 0) != 0) {
271 async_safe_format_log(ANDROID_LOG_ERROR, "libc",
272 "failed to raise ambient capability %lu: %s", i, strerror(errno));
273 }
274 }
275 }
276 }
277
278 // Double-clone, with CLONE_FILES to share the file descriptor table for kcmp validation.
279 // Returns 0 in the orphaned child, the pid of the orphan in the original process, or -1 on failure.
create_vm_process()280 static void create_vm_process() {
281 pid_t first = clone(nullptr, nullptr, CLONE_FILES, nullptr);
282 if (first == -1) {
283 fatal_errno("failed to clone vm process");
284 } else if (first == 0) {
285 drop_capabilities();
286
287 if (clone(nullptr, nullptr, CLONE_FILES, nullptr) == -1) {
288 _exit(errno);
289 }
290
291 // crash_dump is ptracing both sides of the fork; it'll let the parent exit,
292 // but keep the orphan stopped to peek at its memory.
293
294 // There appears to be a bug in the kernel where our death causes SIGHUP to
295 // be sent to our process group if we exit while it has stopped jobs (e.g.
296 // because of wait_for_debugger). Use setsid to create a new process group to
297 // avoid hitting this.
298 setsid();
299
300 _exit(0);
301 }
302
303 int status;
304 if (TEMP_FAILURE_RETRY(waitpid(first, &status, __WCLONE)) != first) {
305 fatal_errno("failed to waitpid in double fork");
306 } else if (!WIFEXITED(status)) {
307 fatal("intermediate process didn't exit cleanly in double fork (status = %d)", status);
308 } else if (WEXITSTATUS(status)) {
309 fatal("second clone failed: %s", strerror(WEXITSTATUS(status)));
310 }
311 }
312
// Arguments handed from debuggerd_signal_handler to debuggerd_dispatch_pseudothread.
struct debugger_thread_info {
  // Tid passed to crash_dump as its first argument. The signal handler sets it to the pid for
  // BIONIC_SIGNAL_DEBUGGER and to the current thread's tid otherwise.
  pid_t crashing_tid;
  // Tid of the pseudothread. Initialized to -1 by the signal handler, which also passes its
  // address to clone() (CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID) and futex-waits on it.
  pid_t pseudothread_tid;
  // Signal information written to crash_dump as part of the crash info.
  siginfo_t* siginfo;
  // Signal context; written to crash_dump as sizeof(ucontext_t) bytes.
  void* ucontext;
  // Process-level data (abort message, fdsan table, allocator state, ...) sent to crash_dump.
  debugger_process_info process_info;
};
320
321 // Logging and contacting debuggerd requires free file descriptors, which we might not have.
322 // Work around this by spawning a "thread" that shares its parent's address space, but not its file
323 // descriptor table, so that we can close random file descriptors without affecting the original
324 // process. Note that this doesn't go through pthread_create, so TLS is shared with the spawning
325 // process.
326 static void* pseudothread_stack;
327
get_dump_type(const debugger_thread_info * thread_info)328 static DebuggerdDumpType get_dump_type(const debugger_thread_info* thread_info) {
329 if (thread_info->siginfo->si_signo == BIONIC_SIGNAL_DEBUGGER &&
330 thread_info->siginfo->si_value.sival_int) {
331 return kDebuggerdNativeBacktrace;
332 }
333
334 return kDebuggerdTombstoneProto;
335 }
336
debuggerd_dispatch_pseudothread(void * arg)337 static int debuggerd_dispatch_pseudothread(void* arg) {
338 debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);
339
340 for (int i = 0; i < 1024; ++i) {
341 // Don't use close to avoid bionic's file descriptor ownership checks.
342 syscall(__NR_close, i);
343 }
344
345 int devnull = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR));
346 if (devnull == -1) {
347 fatal_errno("failed to open /dev/null");
348 } else if (devnull != 0) {
349 fatal_errno("expected /dev/null fd to be 0, actually %d", devnull);
350 }
351
352 // devnull will be 0.
353 TEMP_FAILURE_RETRY(dup2(devnull, 1));
354 TEMP_FAILURE_RETRY(dup2(devnull, 2));
355
356 unique_fd input_read, input_write;
357 unique_fd output_read, output_write;
358 if (!Pipe(&input_read, &input_write) != 0 || !Pipe(&output_read, &output_write)) {
359 fatal_errno("failed to create pipe");
360 }
361
362 uint32_t version;
363 ssize_t expected;
364
365 // ucontext_t is absurdly large on AArch64, so piece it together manually with writev.
366 struct iovec iovs[4] = {
367 {.iov_base = &version, .iov_len = sizeof(version)},
368 {.iov_base = thread_info->siginfo, .iov_len = sizeof(siginfo_t)},
369 {.iov_base = thread_info->ucontext, .iov_len = sizeof(ucontext_t)},
370 };
371
372 constexpr size_t kHeaderSize = sizeof(version) + sizeof(siginfo_t) + sizeof(ucontext_t);
373
374 if (thread_info->process_info.fdsan_table) {
375 // Dynamic executables always use version 4. There is no need to increment the version number if
376 // the format changes, because the sender (linker) and receiver (crash_dump) are version locked.
377 version = 4;
378 expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataDynamic);
379
380 static_assert(sizeof(CrashInfoHeader) + sizeof(CrashInfoDataDynamic) ==
381 kHeaderSize + sizeof(thread_info->process_info),
382 "Wire protocol structs do not match the data sent.");
383 #define ASSERT_SAME_OFFSET(MEMBER1, MEMBER2) \
384 static_assert(sizeof(CrashInfoHeader) + offsetof(CrashInfoDataDynamic, MEMBER1) == \
385 kHeaderSize + offsetof(debugger_process_info, MEMBER2), \
386 "Wire protocol offset does not match data sent: " #MEMBER1);
387 ASSERT_SAME_OFFSET(fdsan_table_address, fdsan_table);
388 ASSERT_SAME_OFFSET(gwp_asan_state, gwp_asan_state);
389 ASSERT_SAME_OFFSET(gwp_asan_metadata, gwp_asan_metadata);
390 ASSERT_SAME_OFFSET(scudo_stack_depot, scudo_stack_depot);
391 ASSERT_SAME_OFFSET(scudo_region_info, scudo_region_info);
392 ASSERT_SAME_OFFSET(scudo_ring_buffer, scudo_ring_buffer);
393 ASSERT_SAME_OFFSET(scudo_ring_buffer_size, scudo_ring_buffer_size);
394 ASSERT_SAME_OFFSET(scudo_stack_depot_size, scudo_stack_depot_size);
395 ASSERT_SAME_OFFSET(recoverable_crash, recoverable_crash);
396 ASSERT_SAME_OFFSET(crash_detail_page, crash_detail_page);
397 #undef ASSERT_SAME_OFFSET
398
399 iovs[3] = {.iov_base = &thread_info->process_info,
400 .iov_len = sizeof(thread_info->process_info)};
401 } else {
402 // Static executables always use version 1.
403 version = 1;
404 expected = sizeof(CrashInfoHeader) + sizeof(CrashInfoDataStatic);
405
406 static_assert(
407 sizeof(CrashInfoHeader) + sizeof(CrashInfoDataStatic) == kHeaderSize + sizeof(uintptr_t),
408 "Wire protocol structs do not match the data sent.");
409
410 iovs[3] = {.iov_base = &thread_info->process_info.abort_msg, .iov_len = sizeof(uintptr_t)};
411 }
412 errno = 0;
413 if (fcntl(output_write.get(), F_SETPIPE_SZ, expected) < static_cast<int>(expected)) {
414 fatal_errno("failed to set pipe buffer size");
415 }
416
417 ssize_t rc = TEMP_FAILURE_RETRY(writev(output_write.get(), iovs, arraysize(iovs)));
418 if (rc == -1) {
419 fatal_errno("failed to write crash info");
420 } else if (rc != expected) {
421 fatal("failed to write crash info, wrote %zd bytes, expected %zd", rc, expected);
422 }
423
424 // Don't use fork(2) to avoid calling pthread_atfork handlers.
425 pid_t crash_dump_pid = _Fork();
426 if (crash_dump_pid == -1) {
427 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
428 "failed to fork in debuggerd signal handler: %s", strerror(errno));
429 } else if (crash_dump_pid == 0) {
430 TEMP_FAILURE_RETRY(dup2(input_write.get(), STDOUT_FILENO));
431 TEMP_FAILURE_RETRY(dup2(output_read.get(), STDIN_FILENO));
432 input_read.reset();
433 input_write.reset();
434 output_read.reset();
435 output_write.reset();
436
437 raise_caps();
438
439 char main_tid[10];
440 char pseudothread_tid[10];
441 char debuggerd_dump_type[10];
442 async_safe_format_buffer(main_tid, sizeof(main_tid), "%d", thread_info->crashing_tid);
443 async_safe_format_buffer(pseudothread_tid, sizeof(pseudothread_tid), "%d",
444 thread_info->pseudothread_tid);
445 async_safe_format_buffer(debuggerd_dump_type, sizeof(debuggerd_dump_type), "%d",
446 get_dump_type(thread_info));
447
448 execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
449 nullptr, nullptr);
450 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to exec crash_dump helper: %s",
451 strerror(errno));
452 return 1;
453 }
454
455 input_write.reset();
456 output_read.reset();
457
458 // crash_dump will ptrace and pause all of our threads, and then write to the pipe to tell
459 // us to fork off a process to read memory from.
460 char buf[4];
461 rc = TEMP_FAILURE_RETRY(read(input_read.get(), &buf, sizeof(buf)));
462
463 bool success = false;
464 if (rc == 1 && buf[0] == '\1') {
465 // crash_dump successfully started, and is ptracing us.
466 // Fork off a copy of our address space for it to use.
467 create_vm_process();
468 success = true;
469 } else {
470 // Something went wrong, log it.
471 if (rc == -1) {
472 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "read of IPC pipe failed: %s",
473 strerror(errno));
474 } else if (rc == 0) {
475 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
476 "crash_dump helper failed to exec, or was killed");
477 } else if (rc != 1) {
478 async_safe_format_log(ANDROID_LOG_FATAL, "libc",
479 "read of IPC pipe returned unexpected value: %zd", rc);
480 } else if (buf[0] != '\1') {
481 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper reported failure");
482 }
483 }
484
485 // Don't leave a zombie child.
486 int status;
487 if (TEMP_FAILURE_RETRY(waitpid(crash_dump_pid, &status, 0)) == -1) {
488 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to wait for crash_dump helper: %s",
489 strerror(errno));
490 } else if (WIFSTOPPED(status) || WIFSIGNALED(status)) {
491 async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper crashed or stopped");
492 }
493
494 if (success) {
495 if (thread_info->siginfo->si_signo != BIONIC_SIGNAL_DEBUGGER) {
496 // For crashes, we don't need to minimize pause latency.
497 // Wait for the dump to complete before having the process exit, to avoid being murdered by
498 // ActivityManager or init.
499 TEMP_FAILURE_RETRY(read(input_read, &buf, sizeof(buf)));
500 }
501 }
502
503 return success ? 0 : 1;
504 }
505
resend_signal(siginfo_t * info)506 static void resend_signal(siginfo_t* info) {
507 // Signals can either be fatal or nonfatal.
508 // For fatal signals, crash_dump will send us the signal we crashed with
509 // before resuming us, so that processes using waitpid on us will see that we
510 // exited with the correct exit status (e.g. so that sh will report
511 // "Segmentation fault" instead of "Killed"). For this to work, we need
512 // to deregister our signal handler for that signal before continuing.
513 if (info->si_signo != BIONIC_SIGNAL_DEBUGGER) {
514 signal(info->si_signo, SIG_DFL);
515 int rc = syscall(SYS_rt_tgsigqueueinfo, __getpid(), __gettid(), info->si_signo, info);
516 if (rc != 0) {
517 fatal_errno("failed to resend signal during crash");
518 }
519 }
520 }
521
522 // Handler that does crash dumping by forking and doing the processing in the child.
523 // Do this by ptracing the relevant thread, and then execing debuggerd to do the actual dump.
debuggerd_signal_handler(int signal_number,siginfo_t * info,void * context)524 static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
525 // Make sure we don't change the value of errno, in case a signal comes in between the process
526 // making a syscall and checking errno.
527 ErrnoRestorer restorer;
528
529 auto *ucontext = static_cast<ucontext_t*>(context);
530
531 // It's possible somebody cleared the SA_SIGINFO flag, which would mean
532 // our "info" arg holds an undefined value.
533 if (!have_siginfo(signal_number)) {
534 info = nullptr;
535 }
536
537 struct siginfo dummy_info = {};
538 if (!info) {
539 memset(&dummy_info, 0, sizeof(dummy_info));
540 dummy_info.si_signo = signal_number;
541 dummy_info.si_code = SI_USER;
542 dummy_info.si_pid = __getpid();
543 dummy_info.si_uid = getuid();
544 info = &dummy_info;
545 } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
546 // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
547 // that contain commit 66dd34a (3.9+). The manpage claims to only allow
548 // negative si_code values that are not SI_TKILL, but 66dd34a changed the
549 // check to allow all si_code values in calls coming from inside the house.
550 }
551
552 debugger_process_info process_info = {};
553 if (g_callbacks.get_process_info) {
554 process_info = g_callbacks.get_process_info();
555 }
556 uintptr_t si_val = reinterpret_cast<uintptr_t>(info->si_ptr);
557 if (signal_number == BIONIC_SIGNAL_DEBUGGER) {
558 // Applications can set abort messages via android_set_abort_message without
559 // actually aborting; ignore those messages in non-fatal dumps.
560 process_info.abort_msg = nullptr;
561 if (info->si_code == SI_QUEUE && info->si_pid == __getpid()) {
562 // Allow for the abort message to be explicitly specified via the sigqueue value.
563 // Keep the bottom bit intact for representing whether we want a backtrace or a tombstone.
564 if (si_val != kDebuggerdFallbackSivalUintptrRequestDump) {
565 process_info.abort_msg = reinterpret_cast<void*>(si_val & ~1);
566 info->si_ptr = reinterpret_cast<void*>(si_val & 1);
567 }
568 }
569 }
570
571 gwp_asan_callbacks_t gwp_asan_callbacks = {};
572 bool recoverable_gwp_asan_crash = false;
573 if (g_callbacks.get_gwp_asan_callbacks != nullptr) {
574 // GWP-ASan catches use-after-free and heap-buffer-overflow by using PROT_NONE
575 // guard pages, which lead to SEGV. Normally, debuggerd prints a bug report
576 // and the process terminates, but in some cases, we actually want to print
577 // the bug report and let the signal handler return, and restart the process.
578 // In order to do that, we need to disable GWP-ASan's guard pages. The
579 // following callbacks handle this case.
580 gwp_asan_callbacks = g_callbacks.get_gwp_asan_callbacks();
581 if (signal_number == SIGSEGV && signal_has_si_addr(info) &&
582 gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery &&
583 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report &&
584 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report &&
585 gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery(info->si_addr)) {
586 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report(info->si_addr);
587 recoverable_gwp_asan_crash = true;
588 process_info.recoverable_crash = true;
589 }
590 }
591
592 if (info->si_signo == SIGSEGV &&
593 (info->si_code == SEGV_MTESERR || info->si_code == SEGV_MTEAERR) && is_permissive_mte()) {
594 process_info.recoverable_crash = true;
595 // If we are in permissive MTE mode, we do not crash, but instead disable MTE on this thread,
596 // and then let the failing instruction be retried. The second time should work (except
597 // if there is another non-MTE fault).
598 int tagged_addr_ctrl = prctl(PR_GET_TAGGED_ADDR_CTRL, 0, 0, 0, 0);
599 if (tagged_addr_ctrl < 0) {
600 fatal_errno("failed to PR_GET_TAGGED_ADDR_CTRL");
601 }
602 tagged_addr_ctrl = (tagged_addr_ctrl & ~PR_MTE_TCF_MASK) | PR_MTE_TCF_NONE;
603 if (prctl(PR_SET_TAGGED_ADDR_CTRL, tagged_addr_ctrl, 0, 0, 0) < 0) {
604 fatal_errno("failed to PR_SET_TAGGED_ADDR_CTRL");
605 }
606 async_safe_format_log(ANDROID_LOG_ERROR, "libc",
607 "MTE ERROR DETECTED BUT RUNNING IN PERMISSIVE MODE. CONTINUING.");
608 pthread_mutex_unlock(&crash_mutex);
609 }
610
611 // If sival_int is ~0, it means that the fallback handler has been called
612 // once before and this function is being called again to dump the stack
613 // of a specific thread. It is possible that the prctl call might return 1,
614 // then return 0 in subsequent calls, so check the sival_int to determine if
615 // the fallback handler should be called first.
616 bool no_new_privs = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0) == 1;
617 if (si_val == kDebuggerdFallbackSivalUintptrRequestDump || no_new_privs) {
618 // This check might be racy if another thread sets NO_NEW_PRIVS, but this should be unlikely,
619 // you can only set NO_NEW_PRIVS to 1, and the effect should be at worst a single missing
620 // ANR trace.
621 debuggerd_fallback_handler(info, ucontext, process_info.abort_msg);
622 if (no_new_privs && recoverable_gwp_asan_crash) {
623 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
624 return;
625 }
626 resend_signal(info);
627 return;
628 }
629
630 // Only allow one thread to handle a signal at a time.
631 int ret = pthread_mutex_lock(&crash_mutex);
632 if (ret != 0) {
633 async_safe_format_log(ANDROID_LOG_INFO, "libc", "pthread_mutex_lock failed: %s", strerror(ret));
634 return;
635 }
636
637 log_signal_summary(info);
638
639 // If we got here due to the signal BIONIC_SIGNAL_DEBUGGER, it's possible
640 // this is not the main thread, which can cause the intercept logic to fail
641 // since the intercept is only looking for the main thread. In this case,
642 // setting crashing_tid to pid instead of the current thread's tid avoids
643 // the problem.
644 debugger_thread_info thread_info = {
645 .crashing_tid = (signal_number == BIONIC_SIGNAL_DEBUGGER) ? __getpid() : __gettid(),
646 .pseudothread_tid = -1,
647 .siginfo = info,
648 .ucontext = context,
649 .process_info = process_info,
650 };
651
652 // Set PR_SET_DUMPABLE to 1, so that crash_dump can ptrace us.
653 int orig_dumpable = prctl(PR_GET_DUMPABLE);
654 if (prctl(PR_SET_DUMPABLE, 1) != 0) {
655 fatal_errno("failed to set dumpable");
656 }
657
658 // On kernels with yama_ptrace enabled, also allow any process to attach.
659 bool restore_orig_ptracer = true;
660 if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) != 0) {
661 if (errno == EINVAL) {
662 // This kernel does not support PR_SET_PTRACER_ANY, or Yama is not enabled.
663 restore_orig_ptracer = false;
664 } else {
665 fatal_errno("failed to set traceable");
666 }
667 }
668
669 // Essentially pthread_create without CLONE_FILES, so we still work during file descriptor
670 // exhaustion.
671 pid_t child_pid =
672 clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
673 CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
674 &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
675 if (child_pid == -1) {
676 fatal_errno("failed to spawn debuggerd dispatch thread");
677 }
678
679 // Wait for the child to start...
680 futex_wait(&thread_info.pseudothread_tid, -1);
681
682 // and then wait for it to terminate.
683 futex_wait(&thread_info.pseudothread_tid, child_pid);
684
685 // Restore PR_SET_DUMPABLE to its original value.
686 if (prctl(PR_SET_DUMPABLE, orig_dumpable) != 0) {
687 fatal_errno("failed to restore dumpable");
688 }
689
690 // Restore PR_SET_PTRACER to its original value.
691 if (restore_orig_ptracer && prctl(PR_SET_PTRACER, 0) != 0) {
692 fatal_errno("failed to restore traceable");
693 }
694
695 if (info->si_signo == BIONIC_SIGNAL_DEBUGGER) {
696 // If the signal is fatal, don't unlock the mutex to prevent other crashing threads from
697 // starting to dump right before our death.
698 pthread_mutex_unlock(&crash_mutex);
699 } else if (process_info.recoverable_crash) {
700 if (recoverable_gwp_asan_crash) {
701 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
702 }
703 pthread_mutex_unlock(&crash_mutex);
704 }
705 #ifdef __aarch64__
706 else if (info->si_signo == SIGSEGV && info->si_code == SEGV_MTEAERR && getppid() == 1) {
707 // Back channel to init (see system/core/init/service.cpp) to signal that
708 // this process crashed due to an ASYNC MTE fault and should be considered
709 // for upgrade to SYNC mode. We are re-using the ART profiler signal, which
710 // is always handled (ignored in native processes, handled for generating a
711 // dump in ART processes), so a process will never crash from this signal
712 // except from here.
713 // The kernel is not particularly receptive to adding this information:
714 // https://lore.kernel.org/all/20220909180617.374238-1-fmayer@google.com/, so we work around
715 // like this.
716 info->si_signo = BIONIC_SIGNAL_ART_PROFILER;
717 resend_signal(info);
718 }
719 #endif
720 else {
721 // Resend the signal, so that either the debugger or the parent's waitpid sees it.
722 resend_signal(info);
723 }
724 }
725
debuggerd_init(debuggerd_callbacks_t * callbacks)726 void debuggerd_init(debuggerd_callbacks_t* callbacks) {
727 if (callbacks) {
728 g_callbacks = *callbacks;
729 }
730
731 size_t thread_stack_pages = 8;
732 void* thread_stack_allocation = mmap(nullptr, getpagesize() * (thread_stack_pages + 2), PROT_NONE,
733 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
734 if (thread_stack_allocation == MAP_FAILED) {
735 fatal_errno("failed to allocate debuggerd thread stack");
736 }
737
738 char* stack = static_cast<char*>(thread_stack_allocation) + getpagesize();
739 if (mprotect(stack, getpagesize() * thread_stack_pages, PROT_READ | PROT_WRITE) != 0) {
740 fatal_errno("failed to mprotect debuggerd thread stack");
741 }
742
743 // Stack grows negatively, set it to the last byte in the page...
744 stack = (stack + thread_stack_pages * getpagesize() - 1);
745 // and align it.
746 stack -= 15;
747 pseudothread_stack = stack;
748
749 struct sigaction action;
750 memset(&action, 0, sizeof(action));
751 sigfillset(&action.sa_mask);
752 action.sa_sigaction = debuggerd_signal_handler;
753 action.sa_flags = SA_RESTART | SA_SIGINFO;
754
755 // Use the alternate signal stack if available so we can catch stack overflows.
756 action.sa_flags |= SA_ONSTACK;
757
758 #define SA_EXPOSE_TAGBITS 0x00000800
759 // Request that the kernel set tag bits in the fault address. This is necessary for diagnosing MTE
760 // faults.
761 action.sa_flags |= SA_EXPOSE_TAGBITS;
762
763 debuggerd_register_handlers(&action);
764 }
765
debuggerd_handle_gwp_asan_signal(int signal_number,siginfo_t * info,void * context)766 bool debuggerd_handle_gwp_asan_signal(int signal_number, siginfo_t* info, void* context) {
767 if (g_callbacks.get_gwp_asan_callbacks == nullptr) return false;
768 gwp_asan_callbacks_t gwp_asan_callbacks = g_callbacks.get_gwp_asan_callbacks();
769 if (gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery == nullptr ||
770 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report == nullptr ||
771 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report == nullptr ||
772 !gwp_asan_callbacks.debuggerd_needs_gwp_asan_recovery(info->si_addr)) {
773 return false;
774 }
775
776 // Only dump a crash report for the first GWP-ASan crash. ActivityManager
777 // doesn't like it when an app crashes multiple times, and is even more strict
778 // about an app crashing multiple times in a short time period. While the app
779 // won't crash fully when we do GWP-ASan recovery, ActivityManager still gets
780 // the information about the crash through the DropBoxManager service. If an
781 // app has multiple back-to-back GWP-ASan crashes, this would lead to the app
782 // being killed, which defeats the purpose of having the recoverable mode. To
783 // mitigate against this, only generate a debuggerd crash report for the first
784 // GWP-ASan crash encountered. We still need to do the patching up of the
785 // allocator though, so do that.
786 static pthread_mutex_t first_crash_mutex = PTHREAD_MUTEX_INITIALIZER;
787 pthread_mutex_lock(&first_crash_mutex);
788 static bool first_crash = true;
789
790 if (first_crash) {
791 // `debuggerd_signal_handler` will call
792 // `debuggerd_gwp_asan_(pre|post)_crash_report`, so no need to manually call
793 // them here.
794 debuggerd_signal_handler(signal_number, info, context);
795 first_crash = false;
796 } else {
797 gwp_asan_callbacks.debuggerd_gwp_asan_pre_crash_report(info->si_addr);
798 gwp_asan_callbacks.debuggerd_gwp_asan_post_crash_report(info->si_addr);
799 }
800
801 pthread_mutex_unlock(&first_crash_mutex);
802 return true;
803 }
804
805 // When debuggerd's signal handler is the first handler called, it's great at
806 // handling the recoverable GWP-ASan and permissive MTE modes. For apps,
807 // sigchain (from libart) is always the first signal handler, and so the
808 // following function is what sigchain must call before processing the signal.
809 // This allows for processing of a potentially recoverable GWP-ASan or MTE
810 // crash. If the signal requires recovery, then dump a report (via the regular
811 // debuggerd hanndler), and patch up the allocator (in the case of GWP-ASan) or
812 // disable MTE on the thread, and allow the process to continue (indicated by
813 // returning 'true'). If the crash has nothing to do with GWP-ASan/MTE, or
814 // recovery isn't possible, return 'false'.
debuggerd_handle_signal(int signal_number,siginfo_t * info,void * context)815 bool debuggerd_handle_signal(int signal_number, siginfo_t* info, void* context) {
816 if (signal_number != SIGSEGV) return false;
817 if (info->si_code == SEGV_MTEAERR || info->si_code == SEGV_MTESERR) {
818 if (!is_permissive_mte()) return false;
819 // Because permissive MTE disables MTE for the entire thread, we're less
820 // worried about getting a whole bunch of crashes in a row. ActivityManager
821 // doesn't like multiple native crashes for an app in a short period of time
822 // (see the comment about recoverable GWP-ASan in
823 // `debuggerd_handle_gwp_asan_signal`), but that shouldn't happen if MTE is
824 // disabled for the entire thread. This might need to be changed if there's
825 // some low-hanging bug that happens across multiple threads in quick
826 // succession.
827 debuggerd_signal_handler(signal_number, info, context);
828 return true;
829 }
830
831 if (!signal_has_si_addr(info)) return false;
832 return debuggerd_handle_gwp_asan_signal(signal_number, info, context);
833 }
834