watchdog是什么
Watchdog是SystemServer的一个线程(mThread = new Thread(this::run, "watchdog");),检测system server重要线程的锁状态和Handler消息是否阻塞,假如有线程block了60s那么就会触发watchdog timeout flow, 触发Android重启来使系统恢复;block 30s会有线程堆栈log打印,并触发/data/anr/trace.txt文件生成。
watchdog怎么用
以AMS为例
public class ActivityManagerService extends IActivityManager.Stub
implements Watchdog.Monitor, BatteryStatsImpl.BatteryCallback, ActivityManagerGlobalLock {
public ActivityManagerService(Context systemContext, ActivityTaskManagerService atm) {
Watchdog.getInstance().addMonitor(this);
Watchdog.getInstance().addThread(mHandler);
}
public void monitor() {
synchronized (this) { }
}
watchdog源码分析
com/android/server/Watchdog.java
private final HandlerChecker mMonitorChecker; //专门用于监控是否死锁
public void addMonitor(Monitor monitor) {
synchronized (mLock) {
mMonitorChecker.addMonitorLocked(monitor);
}
}
/* This handler will be used to post message back onto the main thread */
private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>();//需要监控的looper
public void addThread(Handler thread) {
synchronized (mLock) {
final String name = thread.getLooper().getThread().getName();
mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name)));
}
}
private Watchdog() {
mThread = new Thread(this::run, "watchdog");//本质上是一个线程
// Initialize handler checkers for each common thread we want to check. Note
// that we are not currently checking the background thread, since it can
// potentially hold longer running operations with no guarantees about the timeliness
// of operations there.
//
// The shared foreground thread is the main checker. It is where we
// will also dispatch monitor checks and do other work.
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread");
mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread")));
// Add checker for shared UI thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(UiThread.getHandler(), "ui thread")));
// And also check IO thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(IoThread.getHandler(), "i/o thread")));
// And the display thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(DisplayThread.getHandler(), "display thread")));
// And the animation thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(AnimationThread.getHandler(), "animation thread")));
// And the surface animation thread.
mHandlerCheckers.add(withDefaultTimeout(
new HandlerChecker(SurfaceAnimationThread.getHandler(),
"surface animation thread")));
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());//检测是否binder线程耗尽
mInterestingJavaPids.add(Process.myPid());
// See the notes on DEFAULT_TIMEOUT.
assert DB ||
DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
mTraceErrorLogger = new TraceErrorLogger();
}
public void start() {
mThread.start();
}
private void run() {
boolean waitedHalf = false;
while (true) {
List<HandlerChecker> blockedCheckers = Collections.emptyList();
String subject = "";
boolean allowRestart = true;
int debuggerWasConnected = 0;
boolean doWaitedHalfDump = false;
// The value of mWatchdogTimeoutMillis might change while we are executing the loop.
// We store the current value to use a consistent value for all handlers.
final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
final long checkIntervalMillis = watchdogTimeoutMillis / 2;
final ArrayList<Integer> pids;
synchronized (mLock) {
long timeout = checkIntervalMillis;
// Make sure we (re)spin the checkers that have become idle within
// this wait-and-check interval
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
// We pick the watchdog to apply every time we reschedule the checkers. The
// default timeout might have changed since the last run.
hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
.orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));//往被监测的looper线程发消息
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
mLock.wait(timeout);//等30s
// Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
}
final int waitState = evaluateCheckerCompletionLocked();//判断是否有阻塞
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
if (!waitedHalf) {
Slog.i(TAG, "WAITED_HALF");
waitedHalf = true;
// We've waited half, but we'd need to do the stack trace dump w/o the lock.
blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);//找到被阻塞的looper
subject = describeCheckersLocked(blockedCheckers);
pids = new ArrayList<>(mInterestingJavaPids);
doWaitedHalfDump = true;
} else {
continue;
}
} else {
// something is overdue!
blockedCheckers = getCheckersWithStateLocked(OVERDUE);
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
pids = new ArrayList<>(mInterestingJavaPids);
}
} // END synchronized (mLock)
// If we got here, that means that the system is most likely hung.
//
// First collect stack traces from all threads of the system process.
//
// Then, if we reached the full timeout, kill this process so that the system will
// restart. If we reached half of the timeout, just log some information and continue.
logWatchog(doWaitedHalfDump, subject, pids);//打日志,触发生成trace.txt
if (doWaitedHalfDump) {
// We have waited for only half of the timeout, we continue to wait for the duration
// of the full timeout before killing the process.
continue;
}
IActivityController controller;
synchronized (mLock) {
controller = mController;
}
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
int res = controller.systemNotResponding(subject);
if (res >= 0) {
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
}
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {//判断当前是否处于debugger调试,排除debugger调试引起的超时
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
Slog.w(TAG, "*** GOODBYE!");
if (!Build.IS_USER && isCrashLoopFound()
&& !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
breakCrashLoop();
}
Process.killProcess(Process.myPid());//重启system_server
System.exit(10);
}
waitedHalf = false;
}
}
public final class HandlerChecker implements Runnable {
public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
mWaitMax = handlerCheckerTimeoutMillis;
if (mCompleted) {
// Safe to update monitors in queue, Handler is not in the middle of work
mMonitors.addAll(mMonitorQueue);
mMonitorQueue.clear();
}
if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
|| (mPauseCount > 0)) {
// Don't schedule until after resume OR
// If the target looper has recently been polling, then
// there is no reason to enqueue our checker on it since that
// is as good as it not being deadlocked. This avoid having
// to do a context switch to check the thread. Note that we
// only do this if we have no monitors since those would need to
// be executed at this point.
mCompleted = true;
return;
}
if (!mCompleted) {
// we already have a check in flight, so no need
return;
}
mCompleted = false;
mCurrentMonitor = null;
mStartTime = SystemClock.uptimeMillis();
mHandler.postAtFrontOfQueue(this);//发消息
}
public int getCompletionStateLocked() {//判断是否阻塞
if (mCompleted) {
return COMPLETED;
} else {
long latency = SystemClock.uptimeMillis() - mStartTime;
if (latency < mWaitMax/2) {
return WAITING;
} else if (latency < mWaitMax) {
return WAITED_HALF;
}
}
return OVERDUE;
}
public void run() {
// Once we get here, we ensure that mMonitors does not change even if we call
// #addMonitorLocked because we first add the new monitors to mMonitorQueue and
// move them to mMonitors on the next schedule when mCompleted is true, at which
// point we have completed execution of this method.
final int size = mMonitors.size();
for (int i = 0 ; i < size ; i++) {
synchronized (mLock) {
mCurrentMonitor = mMonitors.get(i);
}
mCurrentMonitor.monitor();//判断是否死锁
}
synchronized (mLock) {
mCompleted = true;
mCurrentMonitor = null;
}
}
}
总结:
Watchdog的主要流程是:开启一个死循环,不断给指定线程发送一条消息,然后休眠30秒,休眠结束后判断是否收到消息的回调,如果有,则正常进行下次循环,如果没收到,判断从发消息到现在的时机小于30秒不处理,大于30秒小于60秒收集信息,大于60秒收集信息并重启。
问题分析解决思路
跟anr解决的思路一致,看log,看trace.txt,注意cpu、io、锁、binder调用甚至是Thread.sleep等常见的因素。可以参考https://blog.csdn.net/wd229047557/article/details/108059481