android 系统稳定性日志抓取之WATCHDOG

android 系统稳定性日志抓取之WATCHDOG

本文不再写watchdog的原理,给大家推荐袁辉辉大神的博客WatchDog工作原理 - Gityuan博客 | 袁辉辉的技术博客,直接讲发生watchdog后android系统如何抓取log,以及抓取log后分别怎么使用log定位问题。本文基于当前最新代码androidR(android11)的aosp分析

先截取gityuan博客中最后的一段话:

watchdog在check过程中出现阻塞1分钟的情况,则会输出:
  AMS.dumpStackTraces
  抓取系统重要进程的当前每个线程的堆栈快照
  :输出system_server和3个native进程的traces
    该方法会输出两次,第一次在超时30s的地方;第二次在超时1min;
  
  WD.dumpKernelStackTraces,输出system_server进程中所有线程的kernel stack;
    节点/proc/%d/task获取进程内所有的线程列表
    节点/proc/%d/stack获取kernel的栈
  
  doSysRq, 触发kernel来dump所有阻塞线程,输出所有CPU的backtrace到kernel log;
    节点/proc/sysrq-trigger
  
  dropBox,输出文件到/data/system/dropbox,内容是trace + blocked信息
  
  杀掉system_server,进而触发zygote进程自杀,从而重启上层framework

1 AMS.dumpStackTraces

1.1 获取2个进程pid列表

抓取系统重要进程的当前每个线程的堆栈快照。关键进程是哪些呢?那就得分析一段代码了

ArrayList<Integer> pids = new ArrayList<>(mInterestingJavaPids);
...
final File stack = ActivityManagerService.dumpStackTraces(
    pids, processCpuTracker, new SparseArray<>(),
    getInterestingNativePids(),
    tracesFileException);

可以看到有mInterestingJavaPids,getInterestingNativePids(),被分为了感兴趣的java进程和native进程。java进程的pid搜集比较简单,systemserver本身以及app进程都受ams管控

//Watchdog.java:350 构造watchdog对象的时候先把systemserver自身pid加入列表中
mInterestingJavaPids.add(Process.myPid());

//Watchdog.java — as each app process starts or dies, its pid is added to or
//removed from the interesting-pid list (the removal path is processDied).
/**
* Notifies the watchdog when a Java process with {@code pid} is started.
* This process may have its stack trace dumped during an ANR.
*/
public void processStarted(String processName, int pid) {
  // Only processes the watchdog cares about are tracked; see
  // isInterestingJavaProcess for the filter.
  if (isInterestingJavaProcess(processName)) {
      Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
      synchronized (this) {
          mInterestingJavaPids.add(pid);
      }
  }
}

/**
* Notifies the watchdog when a Java process with {@code pid} dies.
*/
public void processDied(String processName, int pid) {
  if (isInterestingJavaProcess(processName)) {
      Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
      synchronized (this) {
          // Integer.valueOf forces the remove(Object) overload rather than
          // remove(int index), which would drop the wrong element.
          mInterestingJavaPids.remove(Integer.valueOf(pid));
      }
  }
}

//上面俩函数的HOOK点为 ProcessList.java
//2607行 Watchdog.getInstance().processStarted(app.processName, pid);
//4007行 Watchdog.getInstance().processDied(app.processName, app.pid);

// A Java process is "interesting" to the watchdog when it is the telephony
// process (com.android.phone) or the MediaProvider process.
private static boolean isInterestingJavaProcess(String processName) {
        return "com.android.phone".equals(processName)
                || processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName);
}

native进程收集分为俩部分,一是如surfaceflinger、mediaserver、vold等重要的native进程,二是HAL服务进程

// Which native processes to dump into dropbox's stack traces.
// Entries are either absolute binary paths or process (cmdline) names; pids
// are resolved at dump time via Process.getPidsForCommands().
public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
  "/system/bin/audioserver",
  "/system/bin/cameraserver",
  "/system/bin/drmserver",
  "/system/bin/mediadrmserver",
  "/system/bin/mediaserver",
  "/system/bin/netd",
  "/system/bin/sdcard",
  "/system/bin/surfaceflinger",
  "/system/bin/vold",
  "media.extractor", // system/bin/mediaextractor
  "media.metrics", // system/bin/mediametrics
  "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
  "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
  "com.android.bluetooth",  // Bluetooth service
  "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
};

// HIDL interfaces whose host (HAL) processes are also included in the native
// dump set; their pids are obtained from servicemanager.
public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
  "android.hardware.audio@2.0::IDevicesFactory",
  "android.hardware.audio@4.0::IDevicesFactory",
  "android.hardware.audio@5.0::IDevicesFactory",
  "android.hardware.audio@6.0::IDevicesFactory",
  "android.hardware.biometrics.face@1.0::IBiometricsFace",
  "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
  "android.hardware.bluetooth@1.0::IBluetoothHci",
  "android.hardware.camera.provider@2.4::ICameraProvider",
  "android.hardware.gnss@1.0::IGnss",
  "android.hardware.graphics.allocator@2.0::IAllocator",
  "android.hardware.graphics.composer@2.1::IComposer",
  "android.hardware.health@2.0::IHealth",
  "android.hardware.light@2.0::ILight",
  "android.hardware.media.c2@1.0::IComponentStore",
  "android.hardware.media.omx@1.0::IOmx",
  "android.hardware.media.omx@1.0::IOmxStore",
  "android.hardware.neuralnetworks@1.0::IDevice",
  "android.hardware.power.stats@1.0::IPowerStats",
  "android.hardware.sensors@1.0::ISensors",
  "android.hardware.sensors@2.0::ISensors",
  "android.hardware.sensors@2.1::ISensors",
  "android.hardware.vr@1.0::IVr",
  "android.system.suspend@1.0::ISystemSuspend"
);

//获取hal进程pid是通过servicemanager.dumpDebug实现
//获取重要native进程是通过Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST)

1.2 ActivityManagerService.dumpStackTraces

几个重载函数,包含了创建trace文件,对java进程调用dumpJavaTracesTombstoned,对native进程调用Debug.dumpNativeBacktraceToFileTimeout

/**
 * Convenience overload: dumps stack traces without requesting per-pid byte
 * offsets for the first pids (passes null for firstPidOffsets).
 */
public static File dumpStackTraces(ArrayList<Integer> firstPids,
        ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
        ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile) {
    return dumpStackTraces(firstPids, processCpuTracker, lastPids, nativePids,
            logExceptionCreatingFile, null);
}

// Creates the ANR trace file, then delegates the actual per-pid dumping to
// the String-path overload below; returns the file that was written.
/* package */ static File dumpStackTraces(ArrayList<Integer> firstPids,
      ProcessCpuTracker processCpuTracker, SparseArray<Boolean> lastPids,
      ArrayList<Integer> nativePids, StringWriter logExceptionCreatingFile,
      long[] firstPidOffsets){
      ...
      File tracesFile;
      try {
          // Create the trace file under the traces directory.
          tracesFile = createAnrDumpFile(tracesDir);
      } catch (IOException e) {
        ...
      }
      // Dump the stacks into the freshly created file.
      Pair<Long, Long> offsets = dumpStackTraces(
                tracesFile.getAbsolutePath(), firstPids, nativePids, extraPids);
      return tracesFile;
}

// Core routine: walks the Java pids (firstPids), the native pids, and the
// extra pids, appending each one's stacks to tracesFile.
public static Pair<Long, Long> dumpStackTraces(String tracesFile, ArrayList<Integer> firstPids,
            ArrayList<Integer> nativePids, ArrayList<Integer> extraPids) {
      if (firstPids != null) {
            int num = firstPids.size();
            for (int i = 0; i < num; i++) {
                final int pid = firstPids.get(i);
                ...
                // Java process: dumped via tombstoned (SIGQUIT path).
                final long timeTaken = dumpJavaTracesTombstoned(pid,tracesFile,remainingTime);
                ...
            }
      }
      ...
      if (nativePids != null) {
            for (int pid : nativePids) {
                ...
                // Native process: dumped via debuggerd (signal-35 path); note the
                // millisecond budget is truncated to whole seconds.
                Debug.dumpNativeBacktraceToFileTimeout(
                        pid, tracesFile, (int) (nativeDumpTimeoutMs / 1000));
                ...
            }
        }
        ...
        if (extraPids != null) {
            for (int pid : extraPids) {
                ...
                final long timeTaken = dumpJavaTracesTombstoned(pid, tracesFile, remainingTime);
                ...
            }
        }
        ...
}

分别对java进程和native进程做不同的处理,先看java进程的处理

// Dumps one Java process's stacks; the elided parts time the call. The heavy
// lifting is in the native Debug.dumpJavaBacktraceToFileTimeout.
private static long dumpJavaTracesTombstoned(int pid, String fileName, long timeoutMs) {
        ...
        // The millisecond timeout is truncated to whole seconds here.
        boolean javaSuccess = Debug.dumpJavaBacktraceToFileTimeout(pid, fileName,
                (int) (timeoutMs / 1000));
        ...
    }

转手调用了Debug.dumpJavaBacktraceToFileTimeout,前后进行了耗时计算。也就是说java和native进程最终都走向了Debug.java中的函数中,且这俩函数都是native函数

//Debug.java — both dump entry points are native methods; their JNI
//implementations share one helper (dumpTraces in android_os_Debug.cpp).
public static native boolean dumpJavaBacktraceToFileTimeout(int pid, String file,
                                                            int timeoutSecs);
public static native boolean dumpNativeBacktraceToFileTimeout(int pid, String file,
                                                                  int timeoutSecs);

直接看jni实现

//android_os_Debug.cpp
// JNI entry: dump the ART (Java) stacks of |pid| into |fileName| within |timeoutSecs|.
static jboolean android_os_Debug_dumpJavaBacktraceToFileTimeout(JNIEnv* env, jobject clazz,
        jint pid, jstring fileName, jint timeoutSecs) {
    return dumpTraces(env, pid, fileName, timeoutSecs, kDebuggerdJavaBacktrace) ? JNI_TRUE
                                                                                : JNI_FALSE;
}

// JNI entry: dump the native stacks of |pid| into |fileName| within |timeoutSecs|.
static jboolean android_os_Debug_dumpNativeBacktraceToFileTimeout(JNIEnv* env, jobject clazz,
        jint pid, jstring fileName, jint timeoutSecs) {
    return dumpTraces(env, pid, fileName, timeoutSecs, kDebuggerdNativeBacktrace) ? JNI_TRUE
                                                                                  : JNI_FALSE;
}

殊途同归,最后走到了dumpTraces函数,只是最后一个枚举类型的参数标识了java或native

//system/core/debuggerd/common/include/dump_type.h
// Kind of dump a client may request. Enumerator order matters: values are
// implicit (0..3) and are passed across the debuggerd protocol.
enum DebuggerdDumpType : uint8_t {
  kDebuggerdNativeBacktrace,
  kDebuggerdTombstone,
  kDebuggerdJavaBacktrace,
  kDebuggerdAnyIntercept
};

接下来继续看dumpTraces

// Shared JNI helper: opens the trace file handed down from Java and asks
// debuggerd to append a backtrace of |pid| into it within |timeoutSecs|.
static bool dumpTraces(JNIEnv* env, jint pid, jstring fileName, jint timeoutSecs,
                       DebuggerdDumpType dumpType) {
    const ScopedUtfChars fileNameChars(env, fileName);
    if (fileNameChars.c_str() == nullptr) {
        return false;
    }
    // Append-only write of the file from the Java layer (O_NOFOLLOW: refuse
    // symlinks).
    android::base::unique_fd fd(open(fileNameChars.c_str(),
                                     O_CREAT | O_WRONLY | O_NOFOLLOW | O_CLOEXEC | O_APPEND,
                                     0666));
    if (fd < 0) {
        PLOG(ERROR) << "Can't open " << fileNameChars.c_str();
        return false;
    }
    // Dump the stacks into the file within timeoutSecs.
    int res = dump_backtrace_to_file_timeout(pid, dumpType, timeoutSecs, fd);
    // Flush the file data to storage before reporting the result.
    if (fdatasync(fd.get()) != 0) {
        PLOG(ERROR) << "Failed flushing trace.";
    }
    return res == 0;
}

dump_backtrace_to_file_timeout函数是debuggerd_client库中的函数system/core/debuggerd/client/debuggerd_client.cpp

// libdebuggerd_client entry point: thin wrapper whose real work is
// debuggerd_trigger_dump (below).
int dump_backtrace_to_file_timeout(pid_t tid, DebuggerdDumpType dump_type, int timeout_secs,
                                   int fd) {
  ...// (elided) fd duplication and similar bookkeeping
  // This is the key call.
  int ret = debuggerd_trigger_dump(tid, dump_type, timeout_ms, std::move(copy)) ? 0 : -1;
  ...
  return ret;
}

触发信号给相应pid的进程

bool debuggerd_trigger_dump(pid_t tid, DebuggerdDumpType dump_type, unsigned int timeout_ms,
                            unique_fd output_fd) {
  pid_t pid = tid;
  ...//省略这段代码是java进程如果重启了就更新一下pid的逻辑

  //前面调用dump_backtrace_to_file_timeout的时候设置了超时,下面是超时逻辑
  LOG(INFO) << "libdebuggerd_client: started dumping process " << pid;
  unique_fd sockfd;
  //设置结束时间的时间点 当前时刻+timeout
  const auto end = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms);
  //计算剩余时间的函数 局部变量
  auto time_left = [&end]() { return end - std::chrono::steady_clock::now(); };
  //设置超时的回调函数 局部变量
  auto set_timeout = [timeout_ms, &time_left](int sockfd) {
    if (timeout_ms <= 0) {
      return sockfd;
    }
    //计算剩余时间
    auto remaining = time_left();
    //如果没有剩余时间则触发timeout,返回错误fd
    if (remaining < decltype(remaining)::zero()) {
      LOG(ERROR) << "libdebuggerd_client: timeout expired";
      return -1;
    }

    struct timeval timeout;
    populate_timeval(&timeout, remaining);
    //设置接收超时
    if (setsockopt(sockfd, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout)) != 0) {
      PLOG(ERROR) << "libdebuggerd_client: failed to set receive timeout";
      return -1;
    }
    //设置发送超时
    if (setsockopt(sockfd, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout)) != 0) {
      PLOG(ERROR) << "libdebuggerd_client: failed to set send timeout";
      return -1;
    }
    返回正常fd,可以用来读取
    return sockfd;
  };

  //初始化socket
  sockfd.reset(socket(AF_LOCAL, SOCK_SEQPACKET, 0));
  if (sockfd == -1) {
    PLOG(ERROR) << "libdebugger_client: failed to create socket";
    return false;
  }
  
  //连接名称为"tombstoned_intercept"的localsocket,用来发送拦截tombstone请求,该连接有超时机制
  if (socket_local_client_connect(set_timeout(sockfd.get()), kTombstonedInterceptSocketName,
                                  ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET) == -1) {
    PLOG(ERROR) << "libdebuggerd_client: failed to connect to tombstoned";
    return false;
  }

  //拦截请求结构体,用来告诉tomestone我们要拦截进程号为pid,类型为dump_type的一次dump请求
  InterceptRequest req = {
      .dump_type = dump_type,
      .pid = pid,
  };
  //再次更新timeout
  if (!set_timeout(sockfd)) {
    PLOG(ERROR) << "libdebugger_client: failed to set timeout";
    return false;
  }

  //创建一对未命名管道,产生一对管道fd
  // Create an intermediate pipe to pass to the other end.
  unique_fd pipe_read, pipe_write;
  if (!Pipe(&pipe_read, &pipe_write)) {
    PLOG(ERROR) << "libdebuggerd_client: failed to create pipe";
    return false;
  }

  //设置管道buffer size准备,1M或者读取节点的值
  std::string pipe_size_str;
  int pipe_buffer_size = 1024 * 1024;
  if (android::base::ReadFileToString("/proc/sys/fs/pipe-max-size", &pipe_size_str)) {
    pipe_size_str = android::base::Trim(pipe_size_str);

    if (!android::base::ParseInt(pipe_size_str.c_str(), &pipe_buffer_size, 0)) {
      LOG(FATAL) << "failed to parse pipe max size '" << pipe_size_str << "'";
    }
  }
  
  //设置管道buffer size
  if (fcntl(pipe_read.get(), F_SETPIPE_SZ, pipe_buffer_size) != pipe_buffer_size) {
    PLOG(ERROR) << "failed to set pipe buffer size";
  }

  //通过之前连接的"tombstoned_intercept" 的socket通道把管道写端的fd发送到tombstored进程
  ssize_t rc = SendFileDescriptors(set_timeout(sockfd), &req, sizeof(req), pipe_write.get());
  //这边不需要管道写端的fd
  pipe_write.reset();
  if (rc != sizeof(req)) {
    PLOG(ERROR) << "libdebuggerd_client: failed to send output fd to tombstoned";
    return false;
  }

  //阻塞等待接收拦截请求的结果并对结果进行处理开始{@
  // Check to make sure we've successfully registered.
  InterceptResponse response;
  rc = TEMP_FAILURE_RETRY(recv(set_timeout(sockfd.get()), &response, sizeof(response), MSG_TRUNC));
  ...//省略一下返回结果的处理
  //阻塞等待接收拦截请求的结果并对结果进行处理结束@}

  //这里是整个流程最关键的点,发送信号给需要dumpbacktrace的java或native进程
  if (!send_signal(tid, dump_type)) {
    return false;
  }

  //发完信号后边可以阻塞等待接收之前连接的"tombstoned_intercept"的socket返回
  rc = TEMP_FAILURE_RETRY(recv(set_timeout(sockfd.get()), &response, sizeof(response), MSG_TRUNC));
  ...//省略一些返回结果的处理

  //tombstoned在接收到进程dumpbacktrace的结果后会把文本通过前面发送过去的管道写端写入,这里进行循环读取并写入文件
  // Forward output from the pipe to the output fd.
  while (true) {
    ...//超时逻辑

    struct pollfd pfd = {
        .fd = pipe_read.get(), .events = POLLIN, .revents = 0,
    };
    //监控管道读端的fd
    rc = poll(&pfd, 1, remaining_ms);
    ...//poll结果处理
    
    //正常poll到有数据的时候进行读取
    char buf[1024];
    rc = TEMP_FAILURE_RETRY(read(pipe_read.get(), buf, sizeof(buf)));
    ...//读取结果或错误处理

    //读取到数据直接写入文件
    if (!android::base::WriteFully(output_fd.get(), buf, rc)) {
      PLOG(ERROR) << "libdebuggerd_client: error while writing";
      return false;
    }
  }

  LOG(INFO) << "libdebuggerd_client: done dumping process " << pid;

  return true;
}

最重要的点是对需要dump backtrace的目标进程发出了信号,至于发出的信号具体是什么,还要再分析一段代码

// Java targets receive SIGQUIT (kill -3); native targets receive the bionic
// debugger realtime signal BIONIC_SIGNAL_DEBUGGER (__SIGRTMIN + 3, i.e. 35).
static bool send_signal(pid_t pid, const DebuggerdDumpType dump_type) {
  int signal;
  if (dump_type == kDebuggerdJavaBacktrace) {
    signal = SIGQUIT;
  } else {
    signal = BIONIC_SIGNAL_DEBUGGER;
  }

  // si_value distinguishes the request kind: 0 = tombstone, 1 = backtrace.
  sigval val;
  val.sival_int = 0;
  if (dump_type == kDebuggerdNativeBacktrace) {
    val.sival_int = 1;
  }

  if (sigqueue(pid, signal, val) != 0) {
    PLOG(ERROR) << "libdebuggerd_client: failed to send signal to pid " << pid;
    return false;
  }
  return true;
}

到这里ActivityManagerService.dumpStackTraces发起端的分析就完了。

2 目标进程处理信号

接下来分析接收端(即目标进程)收到linux signal后的响应流程。在android系统中,无论是java进程还是无虚拟机实例的native进程都是从elf格式的可执行文件开始的,可执行文件执行流程推荐看一下这篇文档android linker 浅析_dinuliang的专栏-CSDN博客。其中有说到
ELF文件的加载,内核创建好进程后首先调用load_elf_binary将其映像文件映射到内存,然后映射并执行其解释器也就是linker的代码。linker的代码段是进程间共享的,但数据段为各进程私有。linker执行完后会自动跳转到目标映像的入口地址。在android中,linker代码的运行域由地址0xb0000100开始(see /bionic/linker/Android.mk),直接从_start开始执行。do_execve会预先将应用程序参数(argc,argv[],envc和envp[]还有一些"辅助向量(Auxiliary Vector)"等(see load_elf_binary>create_elf_tables))存放在分配好的用户空间堆栈中,通过堆栈将这些参数和指针(位于linux_binprm结构体bprm中)传递给用户空间的目标进程。 Linker会提取出它所需要的信息,例如目标映像中程序头表在用户空间的地址以及应用程序入口等。Linker会首先调用__linker_init执行一段自举代码,完成其自身的初始化,初始化与目标映像相关的数据结构。Linker会首先调用alloc_info为目标映像分配一个soinfo结构体,它用于存放与映像文件有关的所有信息,这样可以使可执行映像与共享对象共享连接与重定位函数,后面的程序将通过soinfo的flags域判断目标映像是共享库还是可执行程序。
我们从android linker的函数__linker_init开始说起:

//bionic/linker/linker_main.cpp
// Linker entry point — runs before any of the executable's own code.
extern "C" ElfW(Addr) __linker_init(void* raw_args) {
  ...// (elided) linker bootstrap/initialization
  // Continue in __linker_init_post_relocation.
  return __linker_init_post_relocation(args, tmp_linker_so);
}

static ElfW(Addr) __attribute__((noinline))
__linker_init_post_relocation(KernelArgumentBlock& args, soinfo& tmp_linker_so) {
  ...
  // Hand off to linker_main; the returned address is where execution of the
  // loaded executable begins.
  ElfW(Addr) start_address = linker_main(args, exe_to_load);
  ...
  return start_address;
}

// It is the linker, during its initialization, that installs debuggerd's
// signal handler for every process it loads.
static ElfW(Addr) linker_main(KernelArgumentBlock& args, const char* exe_to_load) {
  ...
  // Register the debuggerd signal handler.
  linker_debuggerd_init();
  ...
}

可以看到是linker在初始化的时候注册了debuggerd的信号处理函数

//bionic/linker/linker_debuggerd_android.cpp
// Thin wrapper around the debuggerd module's initializer.
void linker_debuggerd_init() {
  ...
  
  // Forward into debuggerd_init in the debuggerd handler library.
  debuggerd_init(&callbacks);
}

所有的elf文件运行后都会有linker加载和初始化,linker编译的时候链接了静态库libdebuggerd_handler_fallback,所以elf文件本身不用动态或静态链接libdebuggerd_handler_fallback,而是透过linker来使用libdebuggerd_handler_fallback里面的函数。libdebuggerd_handler_fallback代码在system/core/debuggerd目录下

//system/core/debuggerd/handler/debuggerd_handler.cpp

// Installs the crash/backtrace signal handler for this process.
void debuggerd_init(debuggerd_callbacks_t* callbacks) {
  ...
  
  struct sigaction action;
  memset(&action, 0, sizeof(action));
  sigfillset(&action.sa_mask);
  // debuggerd_signal_handler is the callback invoked on signal delivery.
  action.sa_sigaction = debuggerd_signal_handler;
  action.sa_flags = SA_RESTART | SA_SIGINFO;

  // Use the alternate signal stack if available so we can catch stack overflows.
  action.sa_flags |= SA_ONSTACK;
  // Register the handler for debuggerd's signals.
  debuggerd_register_handlers(&action);
}

//system/core/debuggerd/include/debuggerd/handler.h
static void __attribute__((__unused__)) debuggerd_register_handlers(struct sigaction* action) {
  ...// (elided) on debuggable builds (property-gated), crash signals are hooked too
  
  // #define BIONIC_SIGNAL_DEBUGGER (__SIGRTMIN + 3)
  // i.e. signal 35 — the dedicated debuggerd dump-request signal.
  sigaction(BIONIC_SIGNAL_DEBUGGER, action, nullptr);
}

接下来看信号回调函数debuggerd_signal_handler,在信号处理回调函数中,调用clone复制了一个跟目标进程共享了PID/TID和内存空间,却不共享proc文件系统和打开fd的新进程。目标进程收到信号35后,会把主线程的执行上下文保存,然后切换上下文到信号处理函数,所以信号处理函数其实是在主线程执行的,但是原来主线程的堆栈被保存下来了,信号处理结束后可以恢复到信号处理前的执行状态。其他线程不变。下面的代码为信号处理函数实现,可看到clone出新进程后,信号处理函数所在的线程(主线程)会等待新进程结束,当信号处理函数return后恢复主线程现场。从clone调用的flag可以看到,新建的进程是跟目标进程共享内存空间,共享TID、PID的新进程,我们暂且称之为“目标进程的伪进程”,简称伪进程。目标进程和伪进程共享儿子进程。

//system/core/debuggerd/handler/debuggerd_handler.cpp
// Handler that does crash dumping by forking and doing the processing in the child.
// Do this by ptracing the relevant thread, and then execing debuggerd to do the actual dump.
static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void* context) {
  ...// (elided) compatibility fix-ups and special handling for ASan

  // State handed to the clone()d helper below as its argument.
  debugger_thread_info thread_info = {
      .crashing_tid = __gettid(),
      .pseudothread_tid = -1,
      .siginfo = info,
      .ucontext = context,
      .abort_msg = reinterpret_cast<uintptr_t>(abort_message),
      .fdsan_table = reinterpret_cast<uintptr_t>(android_fdsan_get_fd_table()),
      .gwp_asan_state = reinterpret_cast<uintptr_t>(gwp_asan_state),
      .gwp_asan_metadata = reinterpret_cast<uintptr_t>(gwp_asan_metadata),
  };
  
  // Set PR_SET_DUMPABLE to 1, so that crash_dump can ptrace us.
  int orig_dumpable = prctl(PR_GET_DUMPABLE);
  if (prctl(PR_SET_DUMPABLE, 1) != 0) {
    ...
  }

  // On kernels with yama_ptrace enabled, also allow any process to attach.
  bool restore_orig_ptracer = true;
  if (prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY) != 0) {
    ...
  }

  // Essentially pthread_create without CLONE_FILES, so we still work during file descriptor
  // exhaustion.
  // clone() flags: CLONE_THREAD places the child in the caller's thread group
  // (same PID); CLONE_VM shares the address space. CLONE_FILES/CLONE_FS are
  // deliberately absent, so this "pseudothread" has its own fd table — closing
  // fds there cannot disturb the target process. thread_info.pseudothread_tid
  // receives the new task's tid via CLONE_CHILD_SETTID/CLONE_CHILD_CLEARTID.
  pid_t child_pid =
    clone(debuggerd_dispatch_pseudothread, pseudothread_stack,
          CLONE_THREAD | CLONE_SIGHAND | CLONE_VM | CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID,
          &thread_info, nullptr, nullptr, &thread_info.pseudothread_tid);
  if (child_pid == -1) {
    fatal_errno("failed to spawn debuggerd dispatch thread");
  }

  // Wait for the child (pseudothread) to start...
  futex_wait(&thread_info.pseudothread_tid, -1);

  // and then wait for it to terminate.
  futex_wait(&thread_info.pseudothread_tid, child_pid);

  // While the pseudothread ran, this (signal-handling) thread was the one
  // being ptraced and dumped.
  
  // Restore PR_SET_DUMPABLE to its original value.
  if (prctl(PR_SET_DUMPABLE, orig_dumpable) != 0) {
    fatal_errno("failed to restore dumpable");
  }
  // Restore PR_SET_PTRACER to its original value.
  if (restore_orig_ptracer && prctl(PR_SET_PTRACER, 0) != 0) {
    fatal_errno("failed to restore traceable");
  }
  ...
}

debuggerd_dispatch_pseudothread是clone出来的伪进程的入口函数

static int debuggerd_dispatch_pseudothread(void* arg) {
  //获取目标进程传输过来的信息
  debugger_thread_info* thread_info = static_cast<debugger_thread_info*>(arg);

  //关闭目标进程打开的fd,此进程不使用,可以做到伪进程不输出任何log,但是前面说到伪进程不共享目标进程fd,所以不影响目标进程
  for (int i = 0; i < 1024; ++i) {
    // Don't use close to avoid bionic's file descriptor ownership checks.
    syscall(__NR_close, i);
  }

  int devnull = TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR));
  ...//错误处理

  // devnull will be 0.把stdout和stderr重定向到devnull
  TEMP_FAILURE_RETRY(dup2(devnull, 1));
  TEMP_FAILURE_RETRY(dup2(devnull, 2));

  //创建并打开2对管道fd,“目标进程的伪进程”只是父进程的克隆,这里创建的2对管道对是“伪进程”中还要再fork一次产生一个(跟目标进程共享的)子进程,暂且称为“目标进程的干儿子进程”,简称干儿子进程
  unique_fd input_read, input_write;//读取"干儿子进程"的信息,input_write交给"干儿子进程"使用
  unique_fd output_read, output_write;//写信息给“干儿子进程”,output_write交给"干儿子进程"使用
  if (!Pipe(&input_read, &input_write) != 0 || !Pipe(&output_read, &output_write)) {
    fatal_errno("failed to create pipe");
  }

  ...//省略设置管道buffer大小

  struct iovec iovs[] = {
      {.iov_base = &version, .iov_len = sizeof(version)},
      {.iov_base = thread_info->siginfo, .iov_len = sizeof(siginfo_t)},
      {.iov_base = thread_info->ucontext, .iov_len = sizeof(ucontext_t)},
      {.iov_base = &thread_info->abort_msg, .iov_len = sizeof(uintptr_t)},
      {.iov_base = &thread_info->fdsan_table, .iov_len = sizeof(uintptr_t)},
      {.iov_base = &thread_info->gwp_asan_state, .iov_len = sizeof(uintptr_t)},
      {.iov_base = &thread_info->gwp_asan_metadata, .iov_len = sizeof(uintptr_t)},
  };
  //写给还未创建的“干儿子进程”
  ssize_t rc = TEMP_FAILURE_RETRY(writev(output_write.get(), iovs, arraysize(iovs)));
  ...//错误处理

  // Don't use fork(2) to avoid calling pthread_atfork handlers.
  pid_t crash_dump_pid = __fork();//创建干儿子进程(前面提到的伪进程和目标进程共享所有儿子进程CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID)
  if (crash_dump_pid == -1) {
    async_safe_format_log(ANDROID_LOG_FATAL, "libc",
                          "failed to fork in debuggerd signal handler: %s", strerror(errno));
  } else if (crash_dump_pid == 0) {
    //干儿子进程入口,先把input_write和output_read分别设置为stdout和stdin,即使后面exec把进程切换为crashdump也是适用的,即crashdump的stdout为input_write,stdin为output_read。
    TEMP_FAILURE_RETRY(dup2(input_write.get(), STDOUT_FILENO));
    TEMP_FAILURE_RETRY(dup2(output_read.get(), STDIN_FILENO));
    
    //下面这四个fd已经不需要
    input_read.reset();
    input_write.reset();
    output_read.reset();
    output_write.reset();

    raise_caps();//提升权限,为运行crashdump做准备

    //exec 前准备参数
    char main_tid[10];//目标进程pid(主线程pid)
    char pseudothread_tid[10];//伪进程pid(一般来说跟主线程pid是一样的)
    char debuggerd_dump_type[10];//java or native
    ...//参数初始化
    
    //这里把干儿子进程通过exec调用替换成crashdump,这里由干儿子进程变幻产生的crashdump记为crashdump一号
    execle(CRASH_DUMP_PATH, CRASH_DUMP_NAME, main_tid, pseudothread_tid, debuggerd_dump_type,
           nullptr, nullptr);
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to exec crash_dump helper: %s",
                          strerror(errno));
    return 1;
  }
  
  //继续伪进程流程。。。
  
  
  input_write.reset();//给干儿子进程(即crashdump一号)使用的可以关闭
  output_read.reset();//给干儿子进程(即crashdump一号)使用的可以关闭

  // crash_dump will ptrace and pause all of our threads, and then write to the pipe to tell
  // us to fork off a process to read memory from.
  char buf[4];
  //阻塞监听crashdump一号的stdout,当stdout输出的时候这里就不再阻塞,继续后面逻辑。伪进程等待crashdump一号写“1”到stdout
  rc = TEMP_FAILURE_RETRY(read(input_read.get(), &buf, sizeof(buf)));
  ...//这里省略了期望rc为1,即读取到一个字节,进行错误处理

  // crash_dump is ptracing us, fork off a copy of our address space for it to use.
  //这个函数里会调用2次clone,由于伪进程会被干儿子进程ptrace设置PTRACE_O_TRACECLONE,所以clone以后都是stop状态,create_vm_process内部对需要resume的进程进行了ptrace cont调用.由于俩次clone的时候都指定了nullptr的入口函数,所以伪进程的任何线程都不会被新的执行体上下文覆盖,所以新进程中的所有线程堆栈跟目标进程一致
  create_vm_process();
  
  //create_vm_process中有下面的注释
  // crash_dump is ptracing both sides of the fork; it'll let the parent exit,
  // but keep the orphan stopped to peek at its memory.
  //说明两次clone产生的俩个新的关系为父子的进程,父进程退出,子进程则托孤,用来读取内存实现堆栈回溯

  // Don't leave a zombie child. //wait干儿子进程,这里就是等待crashdump ptrace自己并且处理结束后退出,以免出现僵尸进程
  int status;
  if (TEMP_FAILURE_RETRY(waitpid(crash_dump_pid, &status, 0)) == -1) {
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "failed to wait for crash_dump helper: %s",
                          strerror(errno));
  } else if (WIFSTOPPED(status) || WIFSIGNALED(status)) {
    async_safe_format_log(ANDROID_LOG_FATAL, "libc", "crash_dump helper crashed or stopped");
  }

  ...//省略一个其他信号(非35)的处理。
  return 0;//伪进程结束
}

伪进程会fork出“干儿子进程”(即crashdump),然后crashdump会把“伪进程”当成靶进程,进行ptrace获取各个线程寄存器,进行堆栈回溯,然后把收集到的堆栈信息发送给tombstoned。tombstoned那边有个拦截,拦截到数据后发送给ActivityManagerService.dumpStackTraces的发起端,watchdog的发起端是systemserver进程。

2.1 crashdump流程

接下来就开始分析crashdump流程,从上文分析可知crashdump是目标进程的子进程进一步fork产生的孙子进程,其中没有userid以及进程组的变化,所以crashdump是有权限ptrace“伪进程”以及目标进程本身的。

//system/core/debuggerd/crash_dump.cpp
int main(int argc, char** argv) {
  ...
  // getppid() returns the PID shared by the target process and its
  // pseudothread clone; while the pseudothread is alive, operations on this
  // PID act on the pseudothread — the task we will ptrace and unwind.
  pid_t target_process = getppid();

  ...// (elided) check that the process still exists

  // Reparent ourselves to init, so that the signal handler can waitpid on the
  // original process to avoid leaving a zombie for non-fatal dumps.
  // Move the input/output pipes off of stdout/stderr, out of paranoia.
  unique_fd output_pipe(dup(STDOUT_FILENO));// writes go to the pseudothread's input pipe
  unique_fd input_pipe(dup(STDIN_FILENO)); // reads come from the pseudothread's output pipe

  // Another pipe pair: after the fork below, the child uses it to tell this
  // (parent) crash_dump that it may exit — i.e. crash_dump forks a second crash_dump.
  unique_fd fork_exit_read, fork_exit_write;
  if (!Pipe(&fork_exit_read, &fork_exit_write)) {
    PLOG(FATAL) << "failed to create pipe";
  }

  // Fork again: crash_dump #1 (child of the pseudothread) spawns crash_dump #2,
  // which does the actual dumping.
  pid_t forkpid = fork();
  if (forkpid == -1) {
    PLOG(FATAL) << "fork failed";
  } else if (forkpid == 0) {
    fork_exit_read.reset();// unused on this side
  } else {
    // We need the pseudothread to live until we get around to verifying the vm pid against it.
    // The last thing it does is block on a waitpid on us, so wait until our child tells us to die.
    fork_exit_write.reset();// unused on this side
    char buf;
    // crash_dump #1 blocks here until crash_dump #2 tells it to exit.
    TEMP_FAILURE_RETRY(read(fork_exit_read.get(), &buf, sizeof(buf)));
    // Once this parent exits, its child is reparented to init — that is the
    // point of the double fork: after the needed state is captured, everything
    // else can go away, leaving only the orphaned crash_dump #2 to finish up.
    _exit(0);
  }
  
  // Only crash_dump #2 (the grandchild) reaches the code below.
  
  ...// (elided) argument parsing and timeout handling

  // crash_dump no longer shares the target's address space, so process
  // information must come from /proc.
  
  // Get the process name (aka cmdline) by reading /proc/<pid>/cmdline.
  std::string process_name = get_process_name(g_target_thread);

  // Collect the list of open files from the /proc/<pid>/fd/ directory.
  OpenFilesList open_files;
  {
    ATRACE_NAME("open files");
    populate_open_files_list(&open_files, g_target_thread);
  }

  // In order to reduce the duration that we pause the process for, we ptrace
  // the threads, fetch their registers and associated information, and then
  // fork a separate process as a snapshot of the process's address space.
  std::set<pid_t> threads;
  // Thread list from /proc/<pid>/task (pseudothread and target share the pid).
  if (!android::procinfo::GetProcessTids(g_target_thread, &threads)) {
    PLOG(FATAL) << "failed to get process threads";
  }

  std::map<pid_t, ThreadInfo> thread_info;
  siginfo_t siginfo;
  std::string error;

  {
    ATRACE_NAME("ptrace");
    // ptrace each thread in turn.
    for (pid_t thread : threads) {
      // Trace the pseudothread separately, so we can use different options.
      if (thread == pseudothread_tid) {
        // pseudothread_tid is really the thread running the signal handler
        // (usually the main thread). Its pre-signal context was saved away, so
        // registers read here would describe the handler, not the original
        // code — it is handled separately later.
        continue;
      }
      
      // Seize (attach to) the thread.
      if (!ptrace_seize_thread(target_proc_fd, thread, &error)) {
        bool fatal = thread == g_target_thread;
        LOG(fatal ? FATAL : WARNING) << error;
      }

      ThreadInfo info;
      info.pid = target_process;
      info.tid = thread;
      info.uid = getuid();
      info.process_name = process_name;
      info.thread_name = get_thread_name(thread);
      
      // Stop the thread (it shows as state 't').
      if (!ptrace_interrupt(thread, &info.signo)) {
        PLOG(WARNING) << "failed to ptrace interrupt thread " << thread;
        ptrace(PTRACE_DETACH, thread, 0, 0);
        continue;
      }
      // For the target thread, the crash info (registers included) arrives via
      // the pipe. The process need not have crashed — for a watchdog dump this
      // is simply the shared code path.
      if (thread == g_target_thread) {
        // Read the thread's registers along with the rest of the crash info out of the pipe.
        ReadCrashInfo(input_pipe, &siginfo, &info.registers, &abort_msg_address,
                      &fdsan_table_address, &gwp_asan_state, &gwp_asan_metadata);
        info.siginfo = &siginfo;
        info.signo = info.siginfo->si_signo;
      } else {
        // Snapshot this thread's current register set.
        info.registers.reset(unwindstack::Regs::RemoteGet(thread));
        if (!info.registers) {
          PLOG(WARNING) << "failed to fetch registers for thread " << thread;
          ptrace(PTRACE_DETACH, thread, 0, 0);
          continue;
        }
      }

      thread_info[thread] = std::move(info);
    }
  }

  // PTRACE_O_TRACECLONE on the pseudothread: its next clone() stops it and
  // auto-traces the new task (which begins life stopped by SIGSTOP).
  // Trace the pseudothread with PTRACE_O_TRACECLONE and tell it to fork.
  if (!ptrace_seize_thread(target_proc_fd, pseudothread_tid, &error, PTRACE_O_TRACECLONE)) {
    LOG(FATAL) << "failed to seize pseudothread: " << error;
  }

  // Write a byte to wake the pseudothread so it proceeds to
  // create_vm_process() and its two clone() calls.
  if (TEMP_FAILURE_RETRY(write(output_pipe.get(), "\1", 1)) != 1) {
    PLOG(FATAL) << "failed to write to pseudothread";
  }

  // Wait for both clones; PTRACE_GETEVENTMSG yields the new pid. Chain:
  // pseudothread -> child -> grandchild; the child exits, the orphaned
  // grandchild (reparented to init) serves as the memory snapshot.
  pid_t vm_pid = wait_for_vm_process(pseudothread_tid);
  if (ptrace(PTRACE_DETACH, pseudothread_tid, 0, 0) != 0) {
    PLOG(FATAL) << "failed to detach from pseudothread";
  }

  // The pseudothread can die now.
  fork_exit_write.reset();

  // Defer the message until later, for readability.
  bool wait_for_gdb = android::base::GetBoolProperty("debug.debuggerd.wait_for_gdb", false);
  if (siginfo.si_signo == BIONIC_SIGNAL_DEBUGGER) {
    wait_for_gdb = false;// the wait-for-gdb path never runs for backtrace dumps
  }

  // Detach from all of our attached threads before resuming.
  for (const auto& [tid, thread] : thread_info) {
    int resume_signal = thread.signo == BIONIC_SIGNAL_DEBUGGER ? 0 : thread.signo;
    if (wait_for_gdb) {// false here: signal 35 cleared it above
      resume_signal = 0;
      if (tgkill(target_process, tid, SIGSTOP) != 0) {
        PLOG(WARNING) << "failed to send SIGSTOP to " << tid;
      }
    }

    LOG(DEBUG) << "detaching from thread " << tid;
    if (ptrace(PTRACE_DETACH, tid, 0, resume_signal) != 0) {
      PLOG(ERROR) << "failed to detach from thread " << tid;
    }
  }

  // Drop our capabilities now that we've fetched all of the information we need.
  drop_capabilities();

  {// connect to tombstoned's socket
    ATRACE_NAME("tombstoned_connect");
    LOG(INFO) << "obtaining output fd from tombstoned, type: " << dump_type;
    g_tombstoned_connected =
        tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd, dump_type);
  }

  if (g_tombstoned_connected) {// route stdout to the tombstoned-provided fd (or /dev/null)
    if (TEMP_FAILURE_RETRY(dup2(g_output_fd.get(), STDOUT_FILENO)) == -1) {
      PLOG(ERROR) << "failed to dup2 output fd (" << g_output_fd.get() << ") to STDOUT_FILENO";
    }
  } else {
    unique_fd devnull(TEMP_FAILURE_RETRY(open("/dev/null", O_RDWR)));
    TEMP_FAILURE_RETRY(dup2(devnull.get(), STDOUT_FILENO));
    g_output_fd = std::move(devnull);
  }

  LOG(INFO) << "performing dump of process " << target_process
            << " (target tid = " << g_target_thread << ")";

  int signo = siginfo.si_signo;
  bool fatal_signal = signo != BIONIC_SIGNAL_DEBUGGER;
  bool backtrace = false;

  // si_value is special when used with BIONIC_SIGNAL_DEBUGGER.
  //   0: dump tombstone
  //   1: dump backtrace
  if (!fatal_signal) {
    int si_val = siginfo.si_value.sival_int;
    if (si_val == 0) {
      backtrace = false;// taken for fatal crashes, where a tombstone is wanted
    } else if (si_val == 1) {
      backtrace = true;// not a crash, just a backtrace request — the watchdog case
    } else {
      LOG(WARNING) << "unknown si_value value " << si_val;
    }
  }

  // TODO: Use seccomp to lock ourselves down.
  unwindstack::UnwinderFromPid unwinder(256, vm_pid);// reads memory from vm_pid, the snapshot from the two clones
  // Per the original note, Init() ends up invoking Unwind().
  if (!unwinder.Init(unwindstack::Regs::CurrentArch())) {
    LOG(FATAL) << "Failed to init unwinder object.";
  }

  std::string amfd_data;
  if (backtrace) {
    ATRACE_NAME("dump_backtrace");
    // Registers etc. for the threads were fetched above; now unwind the call
    // stacks and write them to tombstoned's socket.
    // One register set is still missing: the thread whose context was replaced
    // by the signal handler (usually the main thread). Its state lives in the
    // vm_pid snapshot, and the unwinder fetches registers during unwinding —
    // see UnwindStackPtrace::Unwind below.
    dump_backtrace(std::move(g_output_fd), &unwinder, thread_info, g_target_thread);
  } else {
    // Tombstone branch — not analyzed here.
    ...// (elided) tombstone path; differs from backtrace mostly in output format
  }

  if (fatal_signal) {
    // Real crashes are reported to ActivityManager, unless system_server
    // itself crashed.
    // Don't try to notify ActivityManager if it just crashed, or we might hang until timeout.
    if (thread_info[target_process].thread_name != "system_server") {
      activity_manager_notify(target_process, signo, amfd_data);
    }
  }

  if (wait_for_gdb) {
    // Use ALOGI to line up with output from engrave_tombstone.
    ALOGI(
        "***********************************************************\n"
        "* Process %d has been suspended while crashing.\n"
        "* To attach gdbserver and start gdb, run this on the host:\n"
        "*\n"
        "*     gdbclient.py -p %d\n"
        "*\n"
        "***********************************************************",
        target_process, target_process);
  }

  // Close stdout before we notify tombstoned of completion.
  close(STDOUT_FILENO);
  // Tell tombstoned the dump is complete.
  if (g_tombstoned_connected && !tombstoned_notify_completion(g_tombstoned_socket.get())) {
    LOG(ERROR) << "failed to notify tombstoned of completion";
  }

  return 0;
}


//system/core/libbacktrace/UnwindStack.cpp
// Called with default argument: context = nullptr.
// Unwinds the stack of thread Tid() into frames_, returning true on success.
bool UnwindStackPtrace::Unwind(size_t num_ignore_frames, void* context) {
  std::unique_ptr<unwindstack::Regs> regs;
  if (context == nullptr) {
    // No ucontext supplied: fetch the thread's registers remotely (ptrace).
    // Only the main thread's registers are missing at this point.
    regs.reset(unwindstack::Regs::RemoteGet(Tid()));
  } else {
    // A ucontext was captured (e.g. inside a signal handler): build regs from it.
    regs.reset(unwindstack::Regs::CreateFromUcontext(unwindstack::Regs::CurrentArch(), context));
  }

  return Backtrace::Unwind(regs.get(), GetMap(), &frames_, num_ignore_frames, nullptr, &error_);
}


上面流程关键步骤2个,一是获取线程列表的寄存器状态,二是回溯调用栈,下面是对堆栈回溯结果的打印

//system/core/debuggerd/libdebuggerd/backtrace.cpp
// Writes the backtrace of every thread in `thread_info` to `output_fd`,
// target thread first, framed by a process header and footer.
void dump_backtrace(android::base::unique_fd output_fd, unwindstack::Unwinder* unwinder,
                    const std::map<pid_t, ThreadInfo>& thread_info, pid_t target_thread) {
  log_t log;
  log.tfd = output_fd.get();
  log.amfd_data = nullptr;

  auto target = thread_info.find(target_thread);
  if (target == thread_info.end()) {
    ALOGE("failed to find target thread in thread info");
    return;
  }

  dump_process_header(&log, target->second.pid, target->second.process_name.c_str());
  // Prints the native-trace header, e.g.:
  //----- pid 1015 at 2020-12-14 15:26:27 -----
  //Cmd line: /system/bin/surfaceflinger
  //ABI: 'arm64'

  // Target thread first, then every other thread in the map.
  dump_backtrace_thread(output_fd.get(), unwinder, target->second);
  for (const auto& [tid, info] : thread_info) {
    if (tid != target_thread) {
      dump_backtrace_thread(output_fd.get(), unwinder, info);
    }
  }
  // Per thread: (1) Unwind the frames, (2) resolve each frame's pc to a symbol
  // via dladdr/addr2line, (3) demangle the name with __cxa_demangle. Example:
  //"surfaceflinger" sysTid=1015
  //  #00 pc 000000000009c248  /apex/com.android.runtime/lib64/bionic/libc.so (__epoll_pwait+8) (BuildId: 7f60442790ed390da057ffcfcdb2ac3c)
  //  #01 pc 0000000000019acc  /system/lib64/libutils.so (android::Looper::pollInner(int)+184) (BuildId: 4e69b93bf70ed592f0029dbd1097529e)
  //  #02 pc 00000000000199ac  /system/lib64/libutils.so (android::Looper::pollOnce(int, int*, int*, void**)+112) (BuildId: 4e69b93bf70ed592f0029dbd1097529e)
  // #03 pc 00000000000fe97c  /system/lib64/libsurfaceflinger.so (android::impl::MessageQueue::waitMessage()+84) (BuildId: 152f8ad9c51082650b23966b3169314c)
  //  #04 pc 00000000001106d0  /system/lib64/libsurfaceflinger.so (android::SurfaceFlinger::run()+20) (BuildId: 152f8ad9c51082650b23966b3169314c)
  //  #05 pc 0000000000002394  /system/bin/surfaceflinger (main+844) (BuildId: c0bf4662a8509b0c305cac8fae2f7c9a)
  //  #06 pc 00000000000499e4  /apex/com.android.runtime/lib64/bionic/libc.so (__libc_init+108) (BuildId: 7f60442790ed390da057ffcfcdb2ac3c)

  dump_process_footer(&log, target->second.pid);
  //----- end 1015 -----
}

crash_dump 流程是在收到信号 35(BIONIC_SIGNAL_DEBUGGER)时触发的;对 java 进程发送信号 35 同样可以抓 trace,但一般对 java 进程发送的是信号 3(SIGQUIT)

2.2 虚拟机的信号处理

无论是 zygote fork 出来的 app 进程,还是 app_process(32/64) 命令直接启动的 java 进程(如 monkey),都有一个名为“Signal Catcher”的线程,该线程是在 Runtime start 或者 postFork 流程中启动的。SignalCatcher 的 Run 函数中做了监听信号发生的操作

//art/runtime/signal_catcher.cc
void* SignalCatcher::Run(void* arg) {
  SignalCatcher* signal_catcher = reinterpret_cast<SignalCatcher*>(arg);
  CHECK(signal_catcher != nullptr);

  Runtime* runtime = Runtime::Current();
  CHECK(runtime->AttachCurrentThread("Signal Catcher", true, runtime->GetSystemThreadGroup(),
                                     !runtime->IsAotCompiler()));

  Thread* self = Thread::Current();
  DCHECK_NE(self->GetState(), kRunnable);
  {
    MutexLock mu(self, signal_catcher->lock_);
    signal_catcher->thread_ = self;
    signal_catcher->cond_.Broadcast(self);
  }

  // Set up mask with signals we want to handle.
  SignalSet signals;
  signals.Add(SIGQUIT);//kill -3 (信号3)
  signals.Add(SIGUSR1);

  while (true) {
    int signal_number = signal_catcher->WaitForSignal(self, signals);//这步是关键
    if (signal_catcher->ShouldHalt()) {
      runtime->DetachCurrentThread();
      return nullptr;
    }

    switch (signal_number) {
    case SIGQUIT:
      signal_catcher->HandleSigQuit();
      break;
    case SIGUSR1:
      signal_catcher->HandleSigUsr1();
      break;
    default:
      LOG(ERROR) << "Unexpected signal %d" << signal_number;
      break;
    }
  }
}

//art/runtime/signal_set.h
// Blocks until one of the signals in set_ arrives and returns its number.
int Wait() {
  // Sleep in sigwait() until a signal arrives. gdb causes EINTR failures.
  int signal_number;
  int rc = TEMP_FAILURE_RETRY(sigwait(&set_, &signal_number));  // system call
  if (rc != 0) {
    PLOG(FATAL) << "sigwait failed";
  }
  return signal_number;
}

当虚拟机收到kill -3信号时,调用HandleSigQuit()

//art/runtime/signal_catcher.cc
// SIGQUIT (kill -3) handler: assembles the full trace text in an in-memory
// stream, then writes it out through Output().
void SignalCatcher::HandleSigQuit() {
  Runtime* runtime = Runtime::Current();
  std::ostringstream os;
  os << "\n"
      << "----- pid " << getpid() << " at " << GetIsoDate() << " -----\n";

  DumpCmdLine(os);  // straightforward: prints the "Cmd line:" header line

  // Note: The strings "Build fingerprint:" and "ABI:" are chosen to match the format used by
  // debuggerd. This allows, for example, the stack tool to work.
  std::string fingerprint = runtime->GetFingerprint();
  os << "Build fingerprint: '" << (fingerprint.empty() ? "unknown" : fingerprint) << "'\n";
  os << "ABI: '" << GetInstructionSetString(runtime->GetInstructionSet()) << "'\n";

  os << "Build type: " << (kIsDebugBuild ? "debug" : "optimized") << "\n";

  // The bulk of the trace: runtime state plus per-thread stacks (see below).
  runtime->DumpForSigQuit(os);
  ...
  os << "----- end " << getpid() << " -----\n";
  Output(os.str());
}

前面打印了trace头部分以及cmdline到ostringstream里面,相对比较简单,
最后Output函数把ostringstream内容通过socket输出给tombstoned,下面代码还是比较清晰的,不用解释了

//art/runtime/signal_catcher.cc
// Sends the assembled trace text to tombstoned over a socket via
// PaletteWriteCrashThreadStacks (error handling elided).
void SignalCatcher::Output(const std::string& s) {
  ScopedThreadStateChange tsc(Thread::Current(), kWaitingForSignalCatcherOutput);
  PaletteStatus status = PaletteWriteCrashThreadStacks(s.data(), s.size());
  ...
}

//system/libartpalette/palette_android.cc
// Connects to tombstoned as a java-backtrace client, streams the stack text
// into the output fd tombstoned handed back, syncs/closes the fd, and finally
// notifies tombstoned that the dump is complete.
enum PaletteStatus PaletteWriteCrashThreadStacks(/*in*/ const char* stacks, size_t stacks_len) {
    android::base::unique_fd tombstone_fd;
    android::base::unique_fd output_fd;

    // Registers this pid with tombstoned and receives output_fd in return.
    if (!tombstoned_connect(getpid(), &tombstone_fd, &output_fd, kDebuggerdJavaBacktrace)) {
      ...
    }

    PaletteStatus status = PaletteStatus::kOkay;
    if (!android::base::WriteFully(output_fd, stacks, stacks_len)) {
        ...
    }

    if (TEMP_FAILURE_RETRY(fdatasync(output_fd)) == -1 && errno != EINVAL) {
        ...
    }

    if (close(output_fd.release()) == -1 && errno != EINTR) {
        ...
    }

    if (!tombstoned_notify_completion(tombstone_fd)) {
        ...
    }

    return status;
}

除去前后打印头部,后面打印尾部以及输出到tombstoned,runtime->DumpForSigQuit是关键部分

// Assembles the body of the SIGQUIT trace: runtime header sections, per-thread
// stacks, mutex state, then notifies registered SigQuit callbacks.
void Runtime::DumpForSigQuit(std::ostream& os) {
  // Trace header sections (class linker, interns, JNI, heap, oat, JIT). {@
  GetClassLinker()->DumpForSigQuit(os);
  GetInternTable()->DumpForSigQuit(os);
  GetJavaVM()->DumpForSigQuit(os);
  GetHeap()->DumpForSigQuit(os);
  oat_file_manager_->DumpForSigQuit(os);
  if (GetJit() != nullptr) {
    GetJit()->DumpForSigQuit(os);
  } else {
    os << "Running non JIT\n";
  }
  DumpDeoptimizations(os);
  TrackedAllocators::Dump(os);
  os << "\n";
  //@}

  thread_list_->DumpForSigQuit(os);
  // Dump lock/mutex-related information.
  BaseMutex::DumpAll(os);

  // Inform anyone else who is interested in SigQuit.
  {
    ScopedObjectAccess soa(Thread::Current());
    callbacks_->SigQuit();  // signal callback
  }
}

只分析主要流程thread_list_->DumpForSigQuit(os);

//art/runtime/thread_list.cc
// Dumps all thread stacks for SIGQUIT: attached threads via a checkpoint,
// unattached threads via direct native unwinding.
void ThreadList::DumpForSigQuit(std::ostream& os) {
  ...
  // Whether native stacks should be printed in addition to java stacks.
  bool dump_native_stack = Runtime::Current()->GetDumpNativeStackOnSigQuit();
  // Runs checkpoints on every attached thread.
  Dump(os, dump_native_stack);
  // Walks the remaining threads and unwinds/symbolizes (addr2line) them.
  DumpUnattachedThreads(os, dump_native_stack && kDumpUnattachedThreadNativeStackForSigQuit);
}

Dump(os, dump_native_stack) 通过 RunCheckpoint 来 dump 所有已附加线程的 java stack;
DumpUnattachedThreads 通过 DumpNativeStack 来 dump 未附加到运行时的线程的 native stack。
先看 native 的,逻辑与 crash_dump 相似

//art/runtime/thread_list.cc
// Dumps threads of this process that are not attached to the runtime, found by
// diffing /proc/self/task against the runtime's own thread list.
void ThreadList::DumpUnattachedThreads(std::ostream& os, bool dump_native_stack) {
  DIR* d = opendir("/proc/self/task");  // kernel's per-thread task list
  if (!d) {
    return;
  }

  Thread* self = Thread::Current();
  dirent* e;
  while ((e = readdir(d)) != nullptr) {  // iterate over every tid entry
    char* end;
    pid_t tid = strtol(e->d_name, &end, 10);
    if (!*end) {
      bool contains;
      {
        MutexLock mu(self, *Locks::thread_list_lock_);
        contains = Contains(tid);
      }
      if (!contains) {
        // Only dump tids the runtime does not already manage.
        DumpUnattachedThread(os, tid, dump_native_stack);
      }
    }
  }
  closedir(d);
}

// Dumps the state header plus (optionally) the native stack of one unattached tid.
static void DumpUnattachedThread(std::ostream& os, pid_t tid, bool dump_native_stack)
    NO_THREAD_SAFETY_ANALYSIS {
  // TODO: No thread safety analysis as DumpState with a null thread won't access fields, should
  // refactor DumpState to avoid skipping analysis.
  Thread::DumpState(os, nullptr, tid);
  if (dump_native_stack) {
    DumpNativeStack(os, tid, nullptr, "  native: ");
  }
  os << std::endl;
}

//art/runtime/native_stack_dump.cc
// Dumps the native stack of `tid` to `os`, one "#NN pc <addr> <map> (<symbol>)"
// line per frame, optionally refining symbols through an addr2line pipe.
void DumpNativeStack(std::ostream& os,
                     pid_t tid,
                     BacktraceMap* existing_map,
                     const char* prefix,
                     ArtMethod* current_method,
                     void* ucontext_ptr,
                     bool skip_frames) {
  ...
  std::unique_ptr<Backtrace> backtrace(Backtrace::Create(BACKTRACE_CURRENT_PROCESS, tid, map));
  backtrace->SetSkipFrames(skip_frames);
  // Unwind the stack frames.
  if (!backtrace->Unwind(0, reinterpret_cast<ucontext*>(ucontext_ptr))) {
    ...
  } else if (backtrace->NumFrames() == 0) {
    ...
  }

  ...// elided: decide whether the external addr2line tool should be used

  std::unique_ptr<Addr2linePipe> addr2line_state;

  // Walk every frame of the backtrace and symbolize it by one means or another.
  for (Backtrace::const_iterator it = backtrace->begin();
       it != backtrace->end(); ++it) {
    // We produce output like this:
    // ]    #00 pc 000075bb8  /system/lib/libc.so (unwind_backtrace_thread+536)
    // In order for parsing tools to continue to function, the stack dump
    // format must at least adhere to this format:
    //  #XX pc <RELATIVE_ADDR>  <FULL_PATH_TO_SHARED_LIBRARY> ...
    // The parsers require a single space before and after pc, and two spaces
    // after the <RELATIVE_ADDR>. There can be any prefix data before the
    // #XX. <RELATIVE_ADDR> has to be a hex number but with no 0x prefix.
    os << prefix << StringPrintf("#%02zu pc ", it->num);
    bool try_addr2line = false;
    if (!BacktraceMap::IsValid(it->map)) {
      os << StringPrintf(Is64BitInstructionSet(kRuntimeISA) ? "%016" PRIx64 "  ???"
                                                            : "%08" PRIx64 "  ???",
                         it->pc);
    } else {
      os << StringPrintf(Is64BitInstructionSet(kRuntimeISA) ? "%016" PRIx64 "  "
                                                            : "%08" PRIx64 "  ",
                         it->rel_pc);
      if (it->map.name.empty()) {
        os << StringPrintf("<anonymous:%" PRIx64 ">", it->map.start);
      } else {
        os << it->map.name;
      }
      if (it->map.offset != 0) {
        os << StringPrintf(" (offset %" PRIx64 ")", it->map.offset);
      }
      os << " (";
      if (!it->func_name.empty()) {
        os << it->func_name;
        if (it->func_offset != 0) {
          os << "+" << it->func_offset;
        }
        // Functions found using the gdb jit interface will be in an empty
        // map that cannot be found using addr2line.
        if (!it->map.name.empty()) {
          try_addr2line = true;
        }
      } else if (current_method != nullptr &&
          Locks::mutator_lock_->IsSharedHeld(Thread::Current()) &&
          PcIsWithinQuickCode(current_method, it->pc)) {
        const void* start_of_code = current_method->GetEntryPointFromQuickCompiledCode();
        os << current_method->JniLongName() << "+"
           << (it->pc - reinterpret_cast<uint64_t>(start_of_code));
      } else {
        os << "???";
      }
      os << ")";
    }
    os << std::endl;
    if (try_addr2line && use_addr2line) {
      Addr2line(it->map.name, it->rel_pc, os, prefix, &addr2line_state);
    }
  }

  if (addr2line_state != nullptr) {
    Drain(0, prefix, &addr2line_state, os);
  }
}

再看javastack的dump

//art/runtime/thread_list.cc
// Dumps all attached threads by running a DumpCheckpoint on each of them.
void ThreadList::Dump(std::ostream& os, bool dump_native_stack) {
  ...
  if (self != nullptr) {
    DumpCheckpoint checkpoint(&os, dump_native_stack);
    size_t threads_running_checkpoint;
    {
      // Use SOA to prevent deadlocks if multiple threads are calling Dump() at the same time.
      ScopedObjectAccess soa(self);
      // The key step: run the checkpoint on every thread.
      threads_running_checkpoint = RunCheckpoint(&checkpoint);
    }
    ...
  } else {
    ...
  }
}

//Declared with a default argument: size_t RunCheckpoint(Closure* checkpoint_function, Closure* callback = nullptr)
// Requests suspension of every other thread, runs `checkpoint_function` on
// self and then on each suspended thread, and returns the thread count.
size_t ThreadList::RunCheckpoint(Closure* checkpoint_function, Closure* callback) {
  Thread* self = Thread::Current();
  Locks::mutator_lock_->AssertNotExclusiveHeld(self);
  Locks::thread_list_lock_->AssertNotHeld(self);
  Locks::thread_suspend_count_lock_->AssertNotHeld(self);

  std::vector<Thread*> suspended_count_modified_threads;
  size_t count = 0;
  {
    ...
    for (const auto& thread : list_) {
      if (thread != self) {
        bool requested_suspend = false;
        ...// elided: decide requested_suspend and reset thread state
        if (requested_suspend) {
          // Collect threads whose suspension we successfully requested.
          suspended_count_modified_threads.push_back(thread);
        }
      }
    }
    ...
  }

  // Run the checkpoint on ourself while we wait for threads to suspend.
  checkpoint_function->Run(self);

  // Run the checkpoint on the suspended threads.
  for (const auto& thread : suspended_count_modified_threads) {
    // We know for sure that the thread is suspended at this point.
    DCHECK(thread->IsSuspended());
    checkpoint_function->Run(thread);
    {
      ...
    }
  }

  ...

  return count;
}

//Both paths above end in checkpoint_function->Run(thread) for each Thread, and
//checkpoint_function is the DumpCheckpoint declared earlier — so look directly
//at DumpCheckpoint::Run.
class DumpCheckpoint final : public Closure {
 public:
  ...
  void Run(Thread* thread) override {
    ...
    std::ostringstream local_os;
    {
      ScopedObjectAccess soa(self);
      // Delegate to the concrete Thread object's dump routine.
      thread->Dump(local_os, dump_native_stack_, backtrace_map_.get());
    }
    ...
  }
...
}

//Per-thread dump logic.
//art/runtime/thread.cc
// Prints the thread state header followed by the stack(s).
void Thread::Dump(std::ostream& os, bool dump_native_stack, BacktraceMap* backtrace_map,
                  bool force_dump_stack) const {
  DumpState(os);  // the "| group=... sysTid=..." header lines (example below)
  DumpStack(os, dump_native_stack, backtrace_map, force_dump_stack);
}

//DumpState(os);举例:
"HwBinder:17453_2" prio=5 tid=62 Native
  | group="main" sCount=1 dsCount=0 flags=1 obj=0x15545a08 self=0xb40000735cd359e0
  | sysTid=17881 nice=-2 cgrp=foreground sched=0/0 handle=0x7158944cc0
  | state=S schedstat=( 4532920 3957026 77 ) utm=0 stm=0 core=6 HZ=100
  | stack=0x715884d000-0x715884f000 stackSize=995KB
  | held mutexes=

//DumpStack logic.
void Thread::DumpStack(std::ostream& os,
                       bool dump_native_stack,
                       BacktraceMap* backtrace_map,
                       bool force_dump_stack) const {
  ...
  if (safe_to_dump || force_dump_stack) {
    ...
    // The interesting path here: dump the managed (java) stack.
    DumpJavaStack(os,
                  /*check_suspended=*/ !force_dump_stack,
                  /*dump_locks=*/ !force_dump_stack);
  } else {
    ...
  }
}

// Walks the managed stack with a StackDumpVisitor, which prints each frame
// and its lock annotations.
void Thread::DumpJavaStack(std::ostream& os, bool check_suspended, bool dump_locks) const {
  ...

  std::unique_ptr<Context> context(Context::Create());
  StackDumpVisitor dumper(os, const_cast<Thread*>(this), context.get(),
                          !tls32_.throwing_OutOfMemoryError, check_suspended, dump_locks);
  dumper.WalkStack();
}

//dumper.WalkStack() is declared on StackDumpVisitor's grandparent StackVisitor
//(declared in art/runtime/stack.h, implemented in art/runtime/stack.cc).
template <StackVisitor::CountTransitions kCount>
void StackVisitor::WalkStack(bool include_transitions) {
  ...
  // The details are intricate and not worth reading line by line here; what
  // matters is that this loop iterates over the managed stack fragments/frames.
  // What actually gets printed is determined by the StackDumpVisitor callbacks.
  for (const ManagedStack* current_fragment = thread_->GetManagedStack();
       current_fragment != nullptr; current_fragment = current_fragment->GetLink()) {
    ...
  }
}

//StackDumpVisitor is where the familiar trace output comes from: the "at ..."
//frame lines plus the lock annotations (locked / waiting on / sleeping on).
struct StackDumpVisitor : public MonitorObjectsStackVisitor {
  ...//constructor elided

  virtual ~StackDumpVisitor() {
    if (frame_count == 0) {
      // Printed when there are zero managed (java) frames — a line often
      // seen in traces.
      os << "  (no managed stack frames)\n";
    }
  }

  static constexpr size_t kMaxRepetition = 3u;

  // Prints one "  at pkg.Class.method(File.java:123)" line per frame,
  // collapsing runs longer than kMaxRepetition into "... repeated N times".
  VisitMethodResult StartMethod(ArtMethod* m, size_t frame_nr ATTRIBUTE_UNUSED)
      override
      REQUIRES_SHARED(Locks::mutator_lock_) {
    m = m->GetInterfaceMethodIfProxy(kRuntimePointerSize);
    ObjPtr<mirror::DexCache> dex_cache = m->GetDexCache();
    int line_number = -1;
    if (dex_cache != nullptr) {  // be tolerant of bad input
      const DexFile* dex_file = dex_cache->GetDexFile();
      line_number = annotations::GetLineNumFromPC(dex_file, m, GetDexPc(false));
    }
    if (line_number == last_line_number && last_method == m) {
      ++repetition_count;
    } else {
      if (repetition_count >= kMaxRepetition) {
        os << "  ... repeated " << (repetition_count - kMaxRepetition) << " times\n";
      }
      repetition_count = 0;
      last_line_number = line_number;
      last_method = m;
    }

    if (repetition_count >= kMaxRepetition) {
      // Skip visiting=printing anything.
      return VisitMethodResult::kSkipMethod;
    }

    os << "  at " << m->PrettyMethod(false);
    if (m->IsNative()) {
      os << "(Native method)";
    } else {
      const char* source_file(m->GetDeclaringClassSourceFile());
      os << "(" << (source_file != nullptr ? source_file : "unavailable")
                       << ":" << line_number << ")";
    }
    os << "\n";
    // Go and visit locks.
    return VisitMethodResult::kContinueMethod;
  }

  VisitMethodResult EndMethod(ArtMethod* m ATTRIBUTE_UNUSED) override {
    return VisitMethodResult::kContinueMethod;
  }

  // Prints the "- waiting on <obj>" line.
  void VisitWaitingObject(ObjPtr<mirror::Object> obj, ThreadState state ATTRIBUTE_UNUSED)
      override
      REQUIRES_SHARED(Locks::mutator_lock_) {
    PrintObject(obj, "  - waiting on ", ThreadList::kInvalidThreadId);
  }

  // Prints the "- sleeping on <obj>" line.
  void VisitSleepingObject(ObjPtr<mirror::Object> obj)
      override
      REQUIRES_SHARED(Locks::mutator_lock_) {
    PrintObject(obj, "  - sleeping on ", ThreadList::kInvalidThreadId);
  }
  // Prints the "waiting to lock" / "waiting for lock inflation" lines.
  void VisitBlockedOnObject(ObjPtr<mirror::Object> obj,
                            ThreadState state,
                            uint32_t owner_tid)
      override
      REQUIRES_SHARED(Locks::mutator_lock_) {
    const char* msg;
    switch (state) {
      case kBlocked:
        msg = "  - waiting to lock ";
        break;

      case kWaitingForLockInflation:
        msg = "  - waiting for lock inflation of ";
        break;

      default:
        LOG(FATAL) << "Unreachable";
        UNREACHABLE();
    }
    PrintObject(obj, msg, owner_tid);
  }
  // Prints the "- locked <obj>" (lock held) line.
  void VisitLockedObject(ObjPtr<mirror::Object> obj)
      override
      REQUIRES_SHARED(Locks::mutator_lock_) {
    PrintObject(obj, "  - locked ", ThreadList::kInvalidThreadId);
  }

  // Shared formatter for the lock annotation lines above.
  void PrintObject(ObjPtr<mirror::Object> obj,
                   const char* msg,
                   uint32_t owner_tid) REQUIRES_SHARED(Locks::mutator_lock_) {
    if (obj == nullptr) {
      os << msg << "an unknown object";
    } else {
      if ((obj->GetLockWord(true).GetState() == LockWord::kThinLocked) &&
          Locks::mutator_lock_->IsExclusiveHeld(Thread::Current())) {
        // Getting the identity hashcode here would result in lock inflation and suspension of the
        // current thread, which isn't safe if this is the only runnable thread.
        os << msg << StringPrintf("<@addr=0x%" PRIxPTR "> (a %s)",
                                  reinterpret_cast<intptr_t>(obj.Ptr()),
                                  obj->PrettyTypeOf().c_str());
      } else {
        // - waiting on <0x6008c468> (a java.lang.Class<java.lang.ref.ReferenceQueue>)
        // Call PrettyTypeOf before IdentityHashCode since IdentityHashCode can cause thread
        // suspension and move pretty_object.
        const std::string pretty_type(obj->PrettyTypeOf());
        os << msg << StringPrintf("<0x%08x> (a %s)", obj->IdentityHashCode(), pretty_type.c_str());
      }
    }
    if (owner_tid != ThreadList::kInvalidThreadId) {
      os << " held by thread " << owner_tid;
    }
    os << "\n";
  }

  ...
};

堆栈的打印流程基本分析完了,也知道无论是java进程接收kill -3或者native进程接收kill -35信号所产生的backtrace都会通过socket写到tombstoned方。在watchdog抓trace过程中,systemserver进程通过调用debuggerd_trigger_dump,发送kill-3或者kill-35的时候,通过socket发送给tombstoned了一个拦截请求:InterceptRequest req = { .dump_type = dump_type, .pid = pid, };。那接下来就看看tombstoned这边发生了什么

3 tombstoned流程

tombstoned从名称看来知道他是一个daemon进程,常驻在内存中,在initrc中启动,该服务中有3个localsocket的声明

//system/core/debuggerd/tombstoned/tombstoned.rc
service tombstoned /system/bin/tombstoned
    user tombstoned
    group system

    socket tombstoned_crash seqpacket 0666 system system
    socket tombstoned_intercept seqpacket 0666 system system
    socket tombstoned_java_trace seqpacket 0666 system system
    writepid /dev/cpuset/system-background/tasks

on post-fs-data
    start tombstoned

tombstoned主要流程

//system/core/debuggerd/tombstoned/tombstoned.cpp
// Daemon entry point: opens the control sockets, creates the intercept manager
// and libevent listeners, then spins the event loop forever.
int main(int, char* []) {
  ...

  // Don't try to connect to ourselves if we crash.
  ...

  int intercept_socket = android_get_control_socket(kTombstonedInterceptSocketName);
  int crash_socket = android_get_control_socket(kTombstonedCrashSocketName);

  ...

  intercept_manager = new InterceptManager(base, intercept_socket);

  // Non-blocking listener for native crash/backtrace clients.
  evconnlistener* tombstone_listener =
      evconnlistener_new(base, crash_accept_cb, CrashQueue::for_tombstones(), LEV_OPT_CLOSE_ON_FREE,
                         -1 /* backlog */, crash_socket);
  ...

  if (kJavaTraceDumpsEnabled) {
    const int java_trace_socket = android_get_control_socket(kTombstonedJavaTraceSocketName);
    ...
    // Same accept callback, but queued as an ANR/java-trace dump.
    evconnlistener* java_trace_listener =
        evconnlistener_new(base, crash_accept_cb, CrashQueue::for_anrs(), LEV_OPT_CLOSE_ON_FREE,
                           -1 /* backlog */, java_trace_socket);
    ...
  }

  LOG(INFO) << "tombstoned successfully initialized";
  event_base_dispatch(base);
}

创建了3个socket服务,非阻塞监听,使用libevent库处理请求,设置socket回调为crash_accept_cb,新建了一个拦截管理器InterceptManager

//system/core/debuggerd/tombstoned/intercept_manager.h
// Registry of pending intercept requests, keyed by the pid whose dump output
// should be redirected to the requester.
struct InterceptManager {
  ...
  std::unordered_map<pid_t, std::unique_ptr<Intercept>> intercepts;
  ...
};

// One registered intercept: the requester's control socket, the pipe fd the
// trace must be written to, and the dump type it applies to.
struct Intercept {
  ...
  android::base::unique_fd sockfd;
  android::base::unique_fd output_fd;
  DebuggerdDumpType dump_type = kDebuggerdNativeBacktrace;
  ...
};

拦截管理器中有个map存储了以pid作为key,拦截对象Intercept作为value的映射关系,dumpbacktrace发起方发送过来的拦截请求就存在这个map里面,拦截对象Intercept有socketfd和output_fd。当然output_fd就是用来写入backtrace内容到发起端的管道fd。这个output_fd是在dumpbacktrace发起方进程中生成的管道fd,通过SendFileDescriptors实现跨进程的fd传输。这个fd在socket两端进程都是有效的fd,可以直接用来读写,也可以再次发送给其他进程!
InterceptManager有个GetIntercept方法,这个GetIntercept的实现通过pid和dumptype找到这个output_fd.

// Looks up the intercept registered for (pid, dump_type); on a hit, tells the
// requester the dump has started and hands the stored output fd to the caller.
bool InterceptManager::GetIntercept(pid_t pid, DebuggerdDumpType dump_type,
                                    android::base::unique_fd* out_fd) {
  ...// elided: look pid up in the intercepts map, fetch its Intercept, dup the output_fd for the caller
  InterceptResponse response = {};
  response.status = InterceptStatus::kStarted;
  TEMP_FAILURE_RETRY(write(intercept->sockfd, &response, sizeof(response)));
  *out_fd = std::move(intercept->output_fd);

  return true;
}

当crashdump或者java进程抓取完log后,会调用tombstoned_connect(g_target_thread, &g_tombstoned_socket, &g_output_fd, dump_type);或者tombstoned_connect(getpid(), &tombstone_fd, &output_fd, kDebuggerdJavaBacktrace)前面并未具体分析这个tombstoned_connect的实现,只是笼统的说把trace写入g_output_fd或者output_fd中,理解为写给tombstoned。现在我们来看看这个output_fd是怎么产生的

//system/core/debuggerd/tombstoned/tombstoned_client.cpp
// Connects to tombstoned, sends a kDumpRequest packet for `pid`, and receives
// back the fd into which the dump text should be written.
bool tombstoned_connect(pid_t pid, unique_fd* tombstoned_socket, unique_fd* output_fd,
                        DebuggerdDumpType dump_type) {
  // Socket name is "tombstoned_crash", or "tombstoned_java_trace" for java backtraces.
  unique_fd sockfd(
      socket_local_client((dump_type != kDebuggerdJavaBacktrace ? kTombstonedCrashSocketName
                                                                : kTombstonedJavaTraceSocketName),
                          ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET));
  ...
  // Build the dump-request packet.
  TombstonedCrashPacket packet = {};
  packet.packet_type = CrashPacketType::kDumpRequest;
  packet.packet.dump_request.pid = pid;
  packet.packet.dump_request.dump_type = dump_type;
  // Send the request.
  if (TEMP_FAILURE_RETRY(write(sockfd, &packet, sizeof(packet))) != sizeof(packet)) {
    async_safe_format_log(ANDROID_LOG_ERROR, "libc", "failed to write DumpRequest packet: %s",
                          strerror(errno));
    return false;
  }

  unique_fd tmp_output_fd;
  // Receive the fd that tombstoned forwarded to us via SendFileDescriptors.
  ssize_t rc = ReceiveFileDescriptors(sockfd, &packet, sizeof(packet), &tmp_output_fd);
  ...
  // Move it into the caller's out-parameter.
  *output_fd = std::move(tmp_output_fd);
  return true;
}

可以看到output_fd是tombstoned进程发送过来的,实现代码为

//system/core/debuggerd/tombstoned/tombstoned.cpp
// Services one queued dump request: fetches the requester's intercept pipe fd
// (if any) and forwards it to the dumping process with a kPerformDump packet.
static void perform_request(Crash* crash) {
  unique_fd output_fd;
  // Look up the pipe fd that a dump requester registered for this pid/type.
  bool intercepted =
      intercept_manager->GetIntercept(crash->crash_pid, crash->crash_type, &output_fd);
  ...

  TombstonedCrashPacket response = {
    .packet_type = CrashPacketType::kPerformDump
  };
  // Hand the pipe fd straight on to crash_dump or the java process.
  ssize_t rc =
      SendFileDescriptors(crash->crash_socket_fd, &response, sizeof(response), output_fd.get());
  output_fd.reset();

  ...
}

所以无论是 crash_dump 还是 java 进程,把抓取的 backtrace 写入 output_fd 时,其实是直接写给 dump backtrace 发起端(watchdog 抓 trace 时发起端是 systemserver 进程)创建的管道 fd(未命名管道在创建时会同时生成读、写两个 fd)。因此发起方同时持有的读端管道 fd 就能直接读到这些 backtrace log。

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

相关阅读更多精彩内容

友情链接更多精彩内容