本节简单介绍了PostgreSQL的后台进程:autovacuum,主要分析了AutoVacLauncherMain函数的实现逻辑。
一、数据结构
宏定义
/*
 * Accessors for the process-wide processing mode, stored in the variable
 * Mode.
 *
 * GetProcessingMode() expands to the current value of Mode.
 *
 * SetProcessingMode(mode) assigns a new value to Mode.  The argument is
 * checked with AssertArg to be one of BootstrapProcessing, InitProcessing
 * or NormalProcessing.  The body is wrapped in do { ... } while(0) so that
 * the macro behaves as a single statement.  Note that "mode" is evaluated
 * more than once, so it should not have side effects.
 */
#define GetProcessingMode() Mode
#define SetProcessingMode(mode) \
do { \
AssertArg((mode) == BootstrapProcessing || \
(mode) == InitProcessing || \
(mode) == NormalProcessing); \
Mode = (mode); \
} while(0)
二、源码解读
AutoVacLauncherMain函数是autovacuum launcher进程的主循环。
/*
 * Main loop for the autovacuum launcher process.
 *
 * The function performs, in order:
 *   1. process setup: ps display, signal handlers, BaseInit/InitProcess/
 *      InitPostgres, and a private memory context (AutovacMemCxt);
 *   2. installation of a sigsetjmp-based error recovery point;
 *   3. forcing of a handful of configuration options to safe values;
 *   4. the launcher loop proper, which sleeps on the process latch, reacts
 *      to SIGHUP/SIGUSR2/SIGTERM flags and starts workers when allowed.
 *
 * argc/argv are not referenced anywhere in this function body.
 *
 * The function does not return to its caller on the paths visible here:
 * every exit goes through proc_exit(0).
 */
NON_EXEC_STATIC void
AutoVacLauncherMain(int argc, char *argv[])
{
sigjmp_buf local_sigjmp_buf;
am_autovacuum_launcher = true;
/* Identify myself via ps */
init_ps_display(pgstat_get_backend_desc(B_AUTOVAC_LAUNCHER), "", "", "");
ereport(DEBUG1,
(errmsg("autovacuum launcher started")));
/* Optional startup delay; PostAuthDelay is scaled by 1000000 for pg_usleep */
if (PostAuthDelay)
pg_usleep(PostAuthDelay * 1000000L);
/* Mark the process as still initializing */
SetProcessingMode(InitProcessing);
/*
 * Set up signal handlers.  We operate on databases much like a regular
 * backend, so we use the same signal handling.  See equivalent code in
 * tcop/postgres.c.
 */
pqsignal(SIGHUP, av_sighup_handler);
pqsignal(SIGINT, StatementCancelHandler);
pqsignal(SIGTERM, avl_sigterm_handler);
pqsignal(SIGQUIT, quickdie);
InitializeTimeouts(); /* establishes SIGALRM handler */
pqsignal(SIGPIPE, SIG_IGN); /* ignore SIGPIPE */
pqsignal(SIGUSR1, procsignal_sigusr1_handler);
pqsignal(SIGUSR2, avl_sigusr2_handler);
pqsignal(SIGFPE, FloatExceptionHandler);
pqsignal(SIGCHLD, SIG_DFL);
/* Early initialization */
BaseInit();
/*
 * Create a per-backend PGPROC struct in shared memory, except in the
 * EXEC_BACKEND case where this was done in SubPostmasterMain.  We must do
 * this before we can use LWLocks (and in the EXEC_BACKEND case we already
 * had to do some stuff with LWLocks).
 */
#ifndef EXEC_BACKEND
InitProcess();
#endif
/* Initialize without naming a database or user (NULL / InvalidOid) */
InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
/* Initialization is done; switch to normal processing mode */
SetProcessingMode(NormalProcessing);
/*
 * Create a memory context that we will do all our work in.  We do this so
 * that we can reset the context during error recovery and thereby avoid
 * possible memory leaks.
 */
AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
"Autovacuum Launcher",
ALLOCSET_DEFAULT_SIZES);
MemoryContextSwitchTo(AutovacMemCxt);
/*
 * If an exception is encountered, processing resumes here.
 *
 * This code is a stripped down version of PostgresMain error recovery.
 *
 * sigsetjmp returns 0 on the initial call, so the block below is skipped
 * the first time through and only runs when control jumps back here.
 */
if (sigsetjmp(local_sigjmp_buf, 1) != 0)
{
/* since not using PG_TRY, must reset error stack by hand */
error_context_stack = NULL;
/* Prevents interrupts while cleaning up */
HOLD_INTERRUPTS();
/* Forget any pending QueryCancel or timeout request */
disable_all_timeouts(false);
QueryCancelPending = false; /* second to avoid race condition */
/* Report the error to the server log */
EmitErrorReport();
/* Abort the current transaction in order to recover */
AbortCurrentTransaction();
/*
 * Release any other resources, for the case where we were not in a
 * transaction.
 */
LWLockReleaseAll();
pgstat_report_wait_end();
AbortBufferIO();
UnlockBuffers();
/* this is probably dead code, but let's be safe: */
if (AuxProcessResourceOwner)
ReleaseAuxProcessResources(false);
AtEOXact_Buffers(false);
AtEOXact_SMgr();
AtEOXact_Files(false);
AtEOXact_HashTables(false);
/*
 * Now return to normal top-level context and clear ErrorContext for
 * next time.
 */
MemoryContextSwitchTo(AutovacMemCxt);
FlushErrorState();
/* Flush any leaked data in the top-level context */
MemoryContextResetAndDeleteChildren(AutovacMemCxt);
/* don't leave dangling pointers to freed memory */
DatabaseListCxt = NULL;
dlist_init(&DatabaseList);
/*
 * Make sure pgstat also considers our stat data as gone.  Note: we
 * mustn't use autovac_refresh_stats here.
 */
pgstat_clear_snapshot();
/* Now we can allow interrupts again */
RESUME_INTERRUPTS();
/* if in shutdown mode, no need for anything further; just go away */
if (got_SIGTERM)
goto shutdown;
/*
 * Sleep at least 1 second after any error.  We don't want to be
 * filling the error logs as fast as we can.
 */
pg_usleep(1000000L);
}
/* We can now handle ereport(ERROR) */
PG_exception_stack = &local_sigjmp_buf;
/* must unblock signals before calling rebuild_database_list */
PG_SETMASK(&UnBlockSig);
/*
 * Set always-secure search path.  Launcher doesn't connect to a database,
 * so this has no effect.
 */
SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
/*
 * Force zero_damaged_pages OFF in the autovac process, even if it is set
 * in postgresql.conf.  We don't really want such a dangerous option being
 * applied non-interactively.
 */
SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
/*
 * Force settable timeouts off to avoid letting these settings prevent
 * regular maintenance from being executed.
 */
SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
SetConfigOption("idle_in_transaction_session_timeout", "0",
PGC_SUSET, PGC_S_OVERRIDE);
/*
 * Force default_transaction_isolation to READ COMMITTED.  We don't want
 * to pay the overhead of serializable mode, nor add any risk of causing
 * deadlocks or delaying other transactions.
 */
SetConfigOption("default_transaction_isolation", "read committed",
PGC_SUSET, PGC_S_OVERRIDE);
/*
 * In emergency mode, just start a worker (unless shutdown was requested)
 * and go away.
 */
if (!AutoVacuumingActive())
{
if (!got_SIGTERM)
do_start_worker();
proc_exit(0); /* done */
}
/* Publish our PID in shared memory; it is cleared again at shutdown */
AutoVacuumShmem->av_launcherpid = MyProcPid;
/*
 * Create the initial database list.  The invariant we want this list to
 * keep is that it's ordered by decreasing next_time.  As soon as an entry
 * is updated to a higher time, it will be moved to the front (which is
 * correct because the only operation is to add autovacuum_naptime to the
 * entry, and time always increases).
 */
rebuild_database_list(InvalidOid);
/* loop until shutdown request */
while (!got_SIGTERM)
{
struct timeval nap;
TimestampTz current_time = 0;
bool can_launch;
/*
 * This loop is a bit different from the normal use of WaitLatch,
 * because we'd like to sleep before the first launch of a child
 * process.  So it's WaitLatch, then ResetLatch, then check for
 * wakening conditions.
 */
launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
false, &nap);
/*
 * Wait until naptime expires or we get some type of signal (all the
 * signal handlers will wake us by calling SetLatch).
 *
 * The timeout argument converts the struct timeval in "nap" to
 * milliseconds: seconds * 1000 plus microseconds / 1000.
 */
(void) WaitLatch(MyLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
(nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
WAIT_EVENT_AUTOVACUUM_MAIN);
ResetLatch(MyLatch);
/* Process sinval catchup interrupts that happened while sleeping */
ProcessCatchupInterrupt();
/* the normal shutdown case */
if (got_SIGTERM)
break;
if (got_SIGHUP)
{
/* SIGHUP received: clear the flag and re-read the configuration */
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
/* shutdown requested in config file? */
if (!AutoVacuumingActive())
break;
/* rebalance in case the default cost parameters changed */
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
autovac_balance_cost();
LWLockRelease(AutovacuumLock);
/* rebuild the list in case the naptime changed */
rebuild_database_list(InvalidOid);
}
/*
 * a worker finished, or postmaster signalled failure to start a
 * worker
 */
if (got_SIGUSR2)
{
/* SIGUSR2 received: clear the flag, then inspect the shared signal slots */
got_SIGUSR2 = false;
/* rebalance cost limits, if needed */
if (AutoVacuumShmem->av_signal[AutoVacRebalance])
{
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
autovac_balance_cost();
LWLockRelease(AutovacuumLock);
}
if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
{
/*
 * If the postmaster failed to start a new worker, we sleep
 * for a little while and resend the signal.  The new worker's
 * state is still in memory, so this is sufficient.  After
 * that, we restart the main loop.
 *
 * XXX should we put a limit to the number of times we retry?
 * I don't think it makes much sense, because a future start
 * of a worker will continue to fail in the same way.
 */
AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
pg_usleep(1000000L); /* 1s */
SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
continue;
}
}
/*
 * There are some conditions that we need to check before trying to
 * start a worker.  First, we need to make sure that there is a worker
 * slot available.  Second, we need to make sure that no other worker
 * failed while starting up.
 */
current_time = GetCurrentTimestamp();
LWLockAcquire(AutovacuumLock, LW_SHARED);
can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
if (AutoVacuumShmem->av_startingWorker != NULL)
{
int waittime;
WorkerInfo worker = AutoVacuumShmem->av_startingWorker;
/*
 * We can't launch another worker when another one is still
 * starting up (or failed while doing so), so just sleep for a bit
 * more; that worker will wake us up again as soon as it's ready.
 * We will only wait autovacuum_naptime seconds (up to a maximum
 * of 60 seconds) for this to happen however.  Note that failure
 * to connect to a particular database is not a problem here,
 * because the worker removes itself from the startingWorker
 * pointer before trying to connect.  Problems detected by the
 * postmaster (like fork() failure) are also reported and handled
 * differently.  The only problems that may cause this code to
 * fire are errors in the earlier sections of AutoVacWorkerMain,
 * before the worker removes the WorkerInfo from the
 * startingWorker pointer.
 */
/* seconds -> milliseconds, the unit TimestampDifferenceExceeds expects */
waittime = Min(autovacuum_naptime, 60) * 1000;
if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
waittime))
{
/* Trade the shared lock for an exclusive one before modifying state */
LWLockRelease(AutovacuumLock);
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
/*
 * No other process can put a worker in starting mode, so if
 * startingWorker is still set (non-NULL) after exchanging our
 * lock, we assume it's the same one we saw above (so we don't
 * recheck the launch time).
 */
if (AutoVacuumShmem->av_startingWorker != NULL)
{
/* Reset the stuck worker's slot and return it to the free list */
worker = AutoVacuumShmem->av_startingWorker;
worker->wi_dboid = InvalidOid;
worker->wi_tableoid = InvalidOid;
worker->wi_sharedrel = false;
worker->wi_proc = NULL;
worker->wi_launchtime = 0;
dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
&worker->wi_links);
AutoVacuumShmem->av_startingWorker = NULL;
elog(WARNING, "worker took too long to start; canceled");
}
}
else
can_launch = false;
}
LWLockRelease(AutovacuumLock); /* either shared or exclusive */
/* if we can't do anything, just go back to sleep */
if (!can_launch)
continue;
/* We're OK to start a new worker */
if (dlist_is_empty(&DatabaseList))
{
/*
 * Special case when the list is empty: start a worker right away.
 * This covers the initial case, when no database is in pgstats
 * (thus the list is empty).  Note that the constraints in
 * launcher_determine_sleep keep us from starting workers too
 * quickly (at most once every autovacuum_naptime when the list is
 * empty).
 */
launch_worker(current_time);
}
else
{
/*
 * because rebuild_database_list constructs a list with most
 * distant adl_next_worker first, we obtain our database from the
 * tail of the list.
 */
avl_dbase *avdb;
avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
/*
 * launch a worker if next_worker is right now or it is in the
 * past
 */
if (TimestampDifferenceExceeds(avdb->adl_next_worker,
current_time, 0))
launch_worker(current_time);
}
}
/*
 * Normal exit from the autovac launcher is here.  The error-recovery
 * block above also jumps to this label when got_SIGTERM is set.
 */
shutdown:
ereport(DEBUG1,
(errmsg("autovacuum launcher shutting down")));
AutoVacuumShmem->av_launcherpid = 0;
proc_exit(0); /* done */
}
/*
 * TimestampDifferenceExceeds -- has at least "msec" milliseconds elapsed
 * between start_time and stop_time?
 *
 * Returns true when (stop_time - start_time) is greater than or equal to
 * the threshold, false otherwise.  The threshold is given in milliseconds
 * and is multiplied by 1000 to bring it to the unit of the timestamps
 * before comparing.
 *
 * Both inputs must be ordinary finite timestamps (in current usage,
 * they'll be results from GetCurrentTimestamp()).
 */
bool
TimestampDifferenceExceeds(TimestampTz start_time,
                           TimestampTz stop_time,
                           int msec)
{
    TimestampTz elapsed = stop_time - start_time;

    /* INT64CONST keeps the multiplication out of plain int arithmetic */
    if (elapsed >= msec * INT64CONST(1000))
        return true;

    return false;
}
/*
 * Return the address of the last element in the list.
 *
 * "type" is the struct type of the list elements, "membername" is the name
 * of the dlist_node field embedded in that struct, and "lhead" is the list
 * head.  The macro first checks that type.membername really is a
 * dlist_node, then converts the tail node's address back into a pointer to
 * the containing struct using offsetof(type, membername).
 *
 * The list must not be empty.
 */
#define dlist_tail_element(type, membername, lhead) \
(AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node), \
((type *) dlist_tail_element_off(lhead, offsetof(type, membername))))
/*
 * Internal helper for dlist_tail_element: given the list head and the byte
 * offset of the embedded list node within its containing struct, return
 * the address of the struct that holds the tail node.
 *
 * The list must not be empty; this is checked with Assert.
 */
static inline void *
dlist_tail_element_off(dlist_head *head, size_t off)
{
    char       *tail_node;

    Assert(!dlist_is_empty(head));

    /* head->head.prev is the last node; step back to the struct start */
    tail_node = (char *) head->head.prev;
    return tail_node - off;
}
三、跟踪分析
N/A