PostgreSQL 源码解读（124）- 后台进程#4（autovacuum进程#1）

本节简单介绍PostgreSQL的后台进程autovacuum,主要分析autovacuum launcher进程的入口函数AutoVacLauncherMain的实现逻辑。

一、数据结构

宏定义

/* Expands to the current value of the Mode variable. */
#define GetProcessingMode() Mode

/*
 * Store (mode) into Mode.
 *
 * Before the assignment, the condition "mode is one of BootstrapProcessing,
 * InitProcessing or NormalProcessing" is handed to AssertArg.  The argument
 * is expanded four times, so callers should pass an expression without side
 * effects.  The do { ... } while(0) wrapper makes the expansion usable as a
 * single statement.
 */
#define SetProcessingMode(mode) \
    do { \
        AssertArg((mode) == BootstrapProcessing || \
                  (mode) == InitProcessing || \
                  (mode) == NormalProcessing); \
        Mode = (mode); \
    } while(0)

二、源码解读

AutoVacLauncherMain函数是autovacuum launcher进程的主循环。

/*
 * Main loop for the autovacuum launcher process.
 *
 * Overview of what the body does, in order:
 *   1. marks the process as the launcher and sets its ps display;
 *   2. installs signal handlers and runs BaseInit / InitProcess /
 *      InitPostgres;
 *   3. creates the AutovacMemCxt working memory context;
 *   4. establishes a sigsetjmp() recovery point that cleans up after an
 *      error and then falls through to the normal path again;
 *   5. overrides a handful of configuration options;
 *   6. loops until got_SIGTERM is set, sleeping on MyLatch and calling
 *      launch_worker() whenever a worker may be started.
 *
 * argc and argv are not referenced anywhere in the body.  Both ways out of
 * the function (the emergency-mode branch and the "shutdown" label) end by
 * calling proc_exit(0).
 */
NON_EXEC_STATIC void
AutoVacLauncherMain(int argc, char *argv[])
{
    sigjmp_buf  local_sigjmp_buf;

    am_autovacuum_launcher = true;

    /* Identify myself via ps */
    init_ps_display(pgstat_get_backend_desc(B_AUTOVAC_LAUNCHER), "", "", "");

    ereport(DEBUG1,
            (errmsg("autovacuum launcher started")));

    /*
     * Optional startup delay.  PostAuthDelay is scaled by 1000000, the same
     * value that is passed to pg_usleep for a one-second sleep further down.
     */
    if (PostAuthDelay)
        pg_usleep(PostAuthDelay * 1000000L);

    /* Enter InitProcessing mode for the duration of initialization */
    SetProcessingMode(InitProcessing);

    /*
     * Set up signal handlers.  We operate on databases much like a regular
     * backend, so we use the same signal handling.  See equivalent code in
     * tcop/postgres.c.
     */
    pqsignal(SIGHUP, av_sighup_handler);
    pqsignal(SIGINT, StatementCancelHandler);
    pqsignal(SIGTERM, avl_sigterm_handler);

    pqsignal(SIGQUIT, quickdie);
    InitializeTimeouts();       /* establishes SIGALRM handler */

    pqsignal(SIGPIPE, SIG_IGN); /* SIGPIPE is ignored */
    pqsignal(SIGUSR1, procsignal_sigusr1_handler);
    pqsignal(SIGUSR2, avl_sigusr2_handler);
    pqsignal(SIGFPE, FloatExceptionHandler);
    pqsignal(SIGCHLD, SIG_DFL);

    /* Early initialization */
    BaseInit();

    /*
     * Create a per-backend PGPROC struct in shared memory, except in the
     * EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
     * this before we can use LWLocks (and in the EXEC_BACKEND case we already
     * had to do some stuff with LWLocks).
     */
#ifndef EXEC_BACKEND
    InitProcess();
#endif
    /* Initialize without naming a database or a user (NULL / InvalidOid) */
    InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);

    /* Initialization is done; switch to NormalProcessing mode */
    SetProcessingMode(NormalProcessing);

    /*
     * Create a memory context that we will do all our work in.  We do this so
     * that we can reset the context during error recovery and thereby avoid
     * possible memory leaks.
     */
    AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
                                          "Autovacuum Launcher",
                                          ALLOCSET_DEFAULT_SIZES);
    MemoryContextSwitchTo(AutovacMemCxt);

    /*
     * If an exception is encountered, processing resumes here.
     *
     * This code is a stripped down version of PostgresMain error recovery.
     */
    if (sigsetjmp(local_sigjmp_buf, 1) != 0)
    {
        /* since not using PG_TRY, must reset error stack by hand */
        error_context_stack = NULL;

        /* Prevents interrupts while cleaning up */
        HOLD_INTERRUPTS();

        /* Forget any pending QueryCancel or timeout request */
        disable_all_timeouts(false);
        QueryCancelPending = false; /* second to avoid race condition */

        /* Report the error to the server log */
        EmitErrorReport();

        /* Abort the current transaction in order to recover */
        AbortCurrentTransaction();

        /*
         * Release any other resources, for the case where we were not in a
         * transaction.
         */
        LWLockReleaseAll();
        pgstat_report_wait_end();
        AbortBufferIO();
        UnlockBuffers();
        /* this is probably dead code, but let's be safe: */
        if (AuxProcessResourceOwner)
            ReleaseAuxProcessResources(false);
        AtEOXact_Buffers(false);
        AtEOXact_SMgr();
        AtEOXact_Files(false);
        AtEOXact_HashTables(false);

        /*
         * Now return to normal top-level context and clear ErrorContext for
         * next time.
         */
        MemoryContextSwitchTo(AutovacMemCxt);
        FlushErrorState();

        /* Flush any leaked data in the top-level context */
        MemoryContextResetAndDeleteChildren(AutovacMemCxt);

        /* don't leave dangling pointers to freed memory */
        DatabaseListCxt = NULL;
        dlist_init(&DatabaseList);

        /*
         * Make sure pgstat also considers our stat data as gone.  Note: we
         * mustn't use autovac_refresh_stats here.
         */
        pgstat_clear_snapshot();

        /* Now we can allow interrupts again */
        RESUME_INTERRUPTS();

        /* if in shutdown mode, no need for anything further; just go away */
        if (got_SIGTERM)
            goto shutdown;

        /*
         * Sleep at least 1 second after any error.  We don't want to be
         * filling the error logs as fast as we can.
         */
        pg_usleep(1000000L);
    }

    /* We can now handle ereport(ERROR) */
    PG_exception_stack = &local_sigjmp_buf;

    /* must unblock signals before calling rebuild_database_list */
    PG_SETMASK(&UnBlockSig);

    /*
     * Set always-secure search path.  Launcher doesn't connect to a database,
     * so this has no effect.
     */
    SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);

    /*
     * Force zero_damaged_pages OFF in the autovac process, even if it is set
     * in postgresql.conf.  We don't really want such a dangerous option being
     * applied non-interactively.
     */
    SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);

    /*
     * Force settable timeouts off to avoid letting these settings prevent
     * regular maintenance from being executed.
     */
    SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
    SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
    SetConfigOption("idle_in_transaction_session_timeout", "0",
                    PGC_SUSET, PGC_S_OVERRIDE);

    /*
     * Force default_transaction_isolation to READ COMMITTED.  We don't want
     * to pay the overhead of serializable mode, nor add any risk of causing
     * deadlocks or delaying other transactions.
     */
    SetConfigOption("default_transaction_isolation", "read committed",
                    PGC_SUSET, PGC_S_OVERRIDE);

    /*
     * In emergency mode, just start a worker (unless shutdown was requested)
     * and go away.  "Emergency mode" here is the case where
     * AutoVacuumingActive() reports false.
     */
    if (!AutoVacuumingActive())
    {
        if (!got_SIGTERM)
            do_start_worker();
        proc_exit(0);           /* done */
    }

    /* Publish our PID in shared memory; it is reset to 0 at shutdown below */
    AutoVacuumShmem->av_launcherpid = MyProcPid;

    /*
     * Create the initial database list.  The invariant we want this list to
     * keep is that it's ordered by decreasing next_time.  As soon as an entry
     * is updated to a higher time, it will be moved to the front (which is
     * correct because the only operation is to add autovacuum_naptime to the
     * entry, and time always increases).
     */
    rebuild_database_list(InvalidOid);

    /* loop until shutdown request */
    while (!got_SIGTERM)
    {
        struct timeval nap;
        TimestampTz current_time = 0;
        bool        can_launch;

        /*
         * This loop is a bit different from the normal use of WaitLatch,
         * because we'd like to sleep before the first launch of a child
         * process.  So it's WaitLatch, then ResetLatch, then check for
         * wakening conditions.
         */

        launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
                                 false, &nap);

        /*
         * Wait until naptime expires or we get some type of signal (all the
         * signal handlers will wake us by calling SetLatch).  The timeout
         * argument is nap converted to milliseconds.
         */
        (void) WaitLatch(MyLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
                         (nap.tv_sec * 1000L) + (nap.tv_usec / 1000L),
                         WAIT_EVENT_AUTOVACUUM_MAIN);

        ResetLatch(MyLatch);

        /* Process sinval catchup interrupts that happened while sleeping */
        ProcessCatchupInterrupt();

        /* the normal shutdown case */
        if (got_SIGTERM)
            break;

        if (got_SIGHUP)
        {
            /* SIGHUP seen: clear the flag and re-read the configuration file */
            got_SIGHUP = false;
            ProcessConfigFile(PGC_SIGHUP);

            /* shutdown requested in config file? */
            if (!AutoVacuumingActive())
                break;

            /* rebalance in case the default cost parameters changed */
            LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
            autovac_balance_cost();
            LWLockRelease(AutovacuumLock);

            /* rebuild the list in case the naptime changed */
            rebuild_database_list(InvalidOid);
        }

        /*
         * a worker finished, or postmaster signalled failure to start a
         * worker
         */
        if (got_SIGUSR2)
        {
            /* clear the flag, then inspect the individual av_signal entries */
            got_SIGUSR2 = false;

            /* rebalance cost limits, if needed */
            if (AutoVacuumShmem->av_signal[AutoVacRebalance])
            {
                LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
                AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
                autovac_balance_cost();
                LWLockRelease(AutovacuumLock);
            }

            if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
            {
                /*
                 * If the postmaster failed to start a new worker, we sleep
                 * for a little while and resend the signal.  The new worker's
                 * state is still in memory, so this is sufficient.  After
                 * that, we restart the main loop.
                 *
                 * XXX should we put a limit to the number of times we retry?
                 * I don't think it makes much sense, because a future start
                 * of a worker will continue to fail in the same way.
                 */
                AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
                pg_usleep(1000000L);    /* 1s */
                SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
                continue;
            }
        }

        /*
         * There are some conditions that we need to check before trying to
         * start a worker.  First, we need to make sure that there is a worker
         * slot available.  Second, we need to make sure that no other worker
         * failed while starting up.
         */

        current_time = GetCurrentTimestamp();
        LWLockAcquire(AutovacuumLock, LW_SHARED);

        can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);

        if (AutoVacuumShmem->av_startingWorker != NULL)
        {
            int         waittime;
            WorkerInfo  worker = AutoVacuumShmem->av_startingWorker;

            /*
             * We can't launch another worker when another one is still
             * starting up (or failed while doing so), so just sleep for a bit
             * more; that worker will wake us up again as soon as it's ready.
             * We will only wait autovacuum_naptime seconds (up to a maximum
             * of 60 seconds) for this to happen however.  Note that failure
             * to connect to a particular database is not a problem here,
             * because the worker removes itself from the startingWorker
             * pointer before trying to connect.  Problems detected by the
             * postmaster (like fork() failure) are also reported and handled
             * differently.  The only problems that may cause this code to
             * fire are errors in the earlier sections of AutoVacWorkerMain,
             * before the worker removes the WorkerInfo from the
             * startingWorker pointer.
             */
            /* naptime capped at 60, times 1000 to get the millisecond threshold */
            waittime = Min(autovacuum_naptime, 60) * 1000;
            if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
                                           waittime))
            {
                /* trade the shared lock for an exclusive one before modifying */
                LWLockRelease(AutovacuumLock);
                LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);

                /*
                 * No other process can put a worker in starting mode, so if
                 * startingWorker is still set (non-NULL) after exchanging our
                 * lock, we assume it's the same one we saw above (so we don't
                 * recheck the launch time).
                 */
                if (AutoVacuumShmem->av_startingWorker != NULL)
                {
                    /*
                     * Give up on this worker: clear its fields, put it back
                     * on the free list and empty the startingWorker pointer.
                     */
                    worker = AutoVacuumShmem->av_startingWorker;
                    worker->wi_dboid = InvalidOid;
                    worker->wi_tableoid = InvalidOid;
                    worker->wi_sharedrel = false;
                    worker->wi_proc = NULL;
                    worker->wi_launchtime = 0;
                    dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
                                    &worker->wi_links);
                    AutoVacuumShmem->av_startingWorker = NULL;
                    elog(WARNING, "worker took too long to start; canceled");
                }
            }
            else
                can_launch = false;
        }
        LWLockRelease(AutovacuumLock);  /* either shared or exclusive */

        /* if we can't do anything, just go back to sleep */
        if (!can_launch)
            continue;

        /* We're OK to start a new worker */
        if (dlist_is_empty(&DatabaseList))
        {
            /*
             * Special case when the list is empty: start a worker right away.
             * This covers the initial case, when no database is in pgstats
             * (thus the list is empty).  Note that the constraints in
             * launcher_determine_sleep keep us from starting workers too
             * quickly (at most once every autovacuum_naptime when the list is
             * empty).
             */
            launch_worker(current_time);
        }
        else
        {
            /*
             * because rebuild_database_list constructs a list with most
             * distant adl_next_worker first, we obtain our database from the
             * tail of the list.
             */
            avl_dbase  *avdb;

            avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);

            /*
             * launch a worker if next_worker is right now or it is in the
             * past
             */
            if (TimestampDifferenceExceeds(avdb->adl_next_worker,
                                           current_time, 0))
                launch_worker(current_time);
        }
    }

    /* Normal exit from the autovac launcher is here */
shutdown:
    ereport(DEBUG1,
            (errmsg("autovacuum launcher shutting down")));
    AutoVacuumShmem->av_launcherpid = 0;

    proc_exit(0);               /* done */
}


/*
 * TimestampDifferenceExceeds -- has at least "msec" milliseconds elapsed
 *      between start_time and stop_time?
 *
 * Returns true when (stop_time - start_time) is greater than or equal to
 * the threshold, false otherwise.  The millisecond threshold is multiplied
 * by 1000 so that it is expressed in the same units as the timestamp
 * difference before the two are compared.
 *
 * Both inputs must be ordinary finite timestamps (in current usage,
 * they'll be results from GetCurrentTimestamp()).
 */
bool
TimestampDifferenceExceeds(TimestampTz start_time,
                           TimestampTz stop_time,
                           int msec)
{
    TimestampTz elapsed = stop_time - start_time;
    TimestampTz threshold = msec * INT64CONST(1000);

    if (elapsed < threshold)
        return false;

    return true;
}


/*
 * Return the address of the last element in the list.
 *
 * "type" is the struct type of the list's elements, "membername" is the
 * name of the field inside that struct which links it into the list, and
 * "lhead" is the list head.  The macro first passes that field, together
 * with dlist_node, to AssertVariableIsOfTypeMacro (presumably a type check
 * -- its definition is not shown here), and then casts the pointer computed
 * by dlist_tail_element_off() to (type *).
 *
 * The list must not be empty.
 */
#define dlist_tail_element(type, membername, lhead)                         \
    (AssertVariableIsOfTypeMacro(((type *) NULL)->membername, dlist_node),  \
     ((type *) dlist_tail_element_off(lhead, offsetof(type, membername))))

/*
 * Internal helper: given the byte offset "off" of the link field within the
 * containing struct, return the address of the struct that holds the list's
 * tail node.  The list is asserted to be non-empty.
 */
static inline void *
dlist_tail_element_off(dlist_head *head, size_t off)
{
    char       *tail_node;

    Assert(!dlist_is_empty(head));

    /* head->head.prev is the tail node; step back to the struct's start */
    tail_node = (char *) head->head.prev;
    return tail_node - off;
}

三、跟踪分析

N/A

四、参考资料

PG Source Code