diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 41278eaf9020223d0842849acf0920ebcd4a09cc..bcb3a3779fcc6a7be7db5a0f80e978889ed7c543 100755
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -104,6 +104,7 @@ install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/access DESTINATION include
 else("${ENABLE_MULTIPLE_NODES}" STREQUAL "OFF")
     install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/access DESTINATION include/postgresql/server
         PATTERN "extreme_rto" EXCLUDE
+        PATTERN "ondemand_extreme_rto" EXCLUDE
         PATTERN "*.h")
 endif()
 install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/mb DESTINATION include/postgresql/server)
diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf
index 05d128ef36401b71f929b8a8e93f59b6e1d9f17f..6e008c4c3dfae452331766d713663ed0f9d7f09e 100755
--- a/src/bin/gs_guc/cluster_guc.conf
+++ b/src/bin/gs_guc/cluster_guc.conf
@@ -719,8 +719,10 @@ ss_enable_catalog_centralized|bool|0,0|NULL|NULL|
 ss_enable_reform|bool|0,0|NULL|NULL|
 ss_enable_ssl|bool|0,0|NULL|NULL|
 ss_enable_aio|bool|0,0|NULL|NULL|
+ss_enable_ondemand_recovery|bool|0,0|NULL|NULL|
 ss_interconnect_channel_count|int|1,32|NULL|NULL|
 ss_work_thread_count|int|16,128|NULL|NULL|
+ss_ondemand_recovery_mem_size|int|1048576,104857600|kB|NULL|
 ss_recv_msg_pool_size|int|1024,1048576|kB|NULL|
 ss_interconnect_type|string|0,0|NULL|NULL|
 ss_log_level|int|0,887|NULL|NULL|
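The two entries above register the new knobs with gs_guc: type, value range, and unit. The memory size is tracked in kB, so 1048576..104857600 corresponds to 1GB..100GB. Mirroring the sample file later in this patch, an opted-in node would carry something like the following in postgresql.conf (values illustrative; 4GB is the compiled-in default set in guc_storage.cpp below):

    ss_enable_ondemand_recovery = on
    ss_ondemand_recovery_mem_size = 4GB    # accepted range: 1GB .. 100GB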
diff --git a/src/bin/pg_controldata/pg_controldata.cpp b/src/bin/pg_controldata/pg_controldata.cpp
index 845d552c7e849561eaee3e251abe121e0ec371ca..9666a066e5595d26e2cfd64726220f088f5298de 100644
--- a/src/bin/pg_controldata/pg_controldata.cpp
+++ b/src/bin/pg_controldata/pg_controldata.cpp
@@ -84,6 +84,20 @@ static const char* dbState(DBState state)
     return _("unrecognized status code");
 }
 
+static const char* SSClusterState(SSGlobalClusterState state) {
+    switch (state) {
+        case CLUSTER_IN_ONDEMAND_BUILD:
+            return _("in on-demand build");
+        case CLUSTER_IN_ONDEMAND_RECOVERY:
+            return _("in on-demand recovery");
+        case CLUSTER_NORMAL:
+            return _("normal");
+        default:
+            break;
+    }
+    return _("unrecognized status code");
+}
+
 static const char* wal_level_str(WalLevel wal_level)
 {
     switch (wal_level) {
@@ -244,8 +258,11 @@ static void display_last_page(ss_reformer_ctrl_t reformerCtrl, int last_page_id)
                  "is expecting. The results below are untrustworthy.\n\n"));
     }
     printf(_("\nreformer data (last page id %d)\n\n"), last_page_id);
+    printf(_("Reform control version number: %u\n"), reformerCtrl.version);
     printf(_("Stable instances list: %lu\n"), reformerCtrl.list_stable);
     printf(_("Primary instance ID: %d\n"), reformerCtrl.primaryInstId);
+    printf(_("Recovery instance ID: %d\n"), reformerCtrl.recoveryInstId);
+    printf(_("Cluster status: %s\n"), SSClusterState(reformerCtrl.clusterStatus));
 }
 
 int main(int argc, char* argv[])
@@ -390,7 +407,7 @@ int main(int argc, char* argv[])
             exit_safely(2);
         }
         display_control_page(ControlFile, display_id, display_all);
-    } 
+    }
 
     /* get the last page from the the pg_control in shared storage mode */
     if (enable_dss && display_id > MAX_INSTANCEID) {
diff --git a/src/common/backend/parser/analyze.cpp b/src/common/backend/parser/analyze.cpp
index 6263436531dd6a8a93fb03cc2025ab06c865905b..e9714537766ce7fe8c972bc7f6af052d1484677a 100644
--- a/src/common/backend/parser/analyze.cpp
+++ b/src/common/backend/parser/analyze.cpp
@@ -614,7 +614,8 @@ Query* transformStmt(ParseState* pstate, Node* parseTree, bool isFirstNode, bool
     if (nodeTag(parseTree) != T_InsertStmt) {
         result->rightRefState = nullptr;
     }
-
+
+    PreventCommandDuringSSOndemandRecovery(parseTree);
     return result;
 }
 
diff --git a/src/common/backend/utils/error/elog.cpp b/src/common/backend/utils/error/elog.cpp
index d9c2b8200169f362777fed0c38a811808b407713..e20efa624c549978e857ffb364aa95ff2673f4f1 100644
--- a/src/common/backend/utils/error/elog.cpp
+++ b/src/common/backend/utils/error/elog.cpp
@@ -4284,6 +4284,54 @@ static void append_with_tabs(StringInfo buf, const char* str)
     }
 }
 
+/*
+ * Get the current time as a "YYYY-MM-DD HH:MM:SS.ms " string.
+ * (Moved from the reaper code in postmaster.cpp so that it can be shared.)
+ */
+void get_time_now(char* nowTime, int timeLen)
+{
+    time_t formatTime;
+    struct timeval current = {0};
+    const int tmpBufSize = 32;
+    char tmpBuf[tmpBufSize] = {0};
+
+    if (nowTime == NULL || timeLen == 0) {
+        return;
+    }
+
+    (void)gettimeofday(&current, NULL);
+    formatTime = current.tv_sec;
+    struct tm* pTime = localtime(&formatTime);
+    strftime(tmpBuf, sizeof(tmpBuf), "%Y-%m-%d %H:%M:%S", pTime);
+
+    errno_t rc = sprintf_s(nowTime, timeLen - 1, "%s.%ld ", tmpBuf, current.tv_usec / 1000);
+    securec_check_ss(rc, "\0", "\0");
+}
+
+void write_stderr_with_prefix(const char* fmt, ...)
+{
+    va_list ap;
+    const int timeBufSize = 256;
+    const int bufSize = 2048;
+    char timeBuf[timeBufSize] = {0};
+    char buf[bufSize] = {0};
+
+    /* syslogger thread can not write log into pipe */
+    if (t_thrd.role == SYSLOGGER) {
+        return;
+    }
+
+    get_time_now(timeBuf, timeBufSize);
+
+    fmt = _(fmt);
+    va_start(ap, fmt);
+    errno_t rc = sprintf_s(buf, bufSize - 1, "%s[%lu] %s\n", timeBuf, t_thrd.proc_cxt.MyProcPid, fmt);
+    securec_check_ss(rc, "\0", "\0");
+
+    vfprintf(stderr, buf, ap);
+    fflush(stderr);
+    va_end(ap);
+}
+
 /*
  * Write errors to stderr (or by equal means when stderr is
  * not available). Used before ereport/elog can be used
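write_stderr_with_prefix() first composes a "timestamp[thread-id] format\n" template, then hands the caller's varargs to vfprintf(), so it is used exactly like printf. A minimal usage sketch (the call site and the node_id variable are hypothetical, not part of this patch):

    /* emits e.g. "2024-01-01 12:00:00.123[140245...] [SS reform] node:0 wait reform done" */
    write_stderr_with_prefix("[SS reform] node:%d wait reform done", node_id);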
diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp
index 9acbdf5b9e45184f9853e9d625bcdcdcda35c216..879821fd06e79d1330448dedbb9aea4b1372e1c9 100644
--- a/src/common/backend/utils/init/globals.cpp
+++ b/src/common/backend/utils/init/globals.cpp
@@ -75,12 +75,13 @@ bool will_shutdown = false;
  * NEXT | 92899 | ?     | ?
  ********************************************/
-const uint32 GRAND_VERSION_NUM = 92900;
+const uint32 GRAND_VERSION_NUM = 92901;
 
 /********************************************
  * 2.VERSION NUM FOR EACH FEATURE
  * Please write in descending order.
  ********************************************/
+const uint32 ONDEMAND_REDO_VERSION_NUM = 92901;
 const uint32 SRF_FUSION_VERSION_NUM = 92847;
 const uint32 INDEX_HINT_VERSION_NUM = 92845;
 const uint32 INNER_UNIQUE_VERSION_NUM = 92845;
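ONDEMAND_REDO_VERSION_NUM doubles as the feature's version number and as the gate for the new reform-control-page layout. The idiom used throughout the rest of this patch is a plain comparison against the cluster's working version; a minimal sketch of the pattern, condensed from the callers below:

    /* keep the old on-disk format while any node may still run the previous release */
    if (pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) {
        /* read/write ss_old_reformer_ctrl_t, refuse to enable the feature */
    }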
"%s/pg_csnlog", dssdir); + securec_check_ss(rc, "", ""); - rc = snprintf_s(g_instance.datadir_cxt.serialDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_serial", dssdir); - securec_check_ss(rc, "", ""); + rc = snprintf_s(g_instance.datadir_cxt.serialDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_serial", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.twophaseDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_twophase", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.multixactDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_multixact", dssdir); + securec_check_ss(rc, "", ""); + + rc = snprintf_s(g_instance.datadir_cxt.xlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_xlog%d", dssdir, + g_instance.attr.attr_storage.dms_attr.instance_id); + securec_check_ss(rc, "", ""); - rc = snprintf_s(g_instance.datadir_cxt.twophaseDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_twophase", dssdir); - securec_check_ss(rc, "", ""); + rc = snprintf_s(g_instance.datadir_cxt.controlPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control", dssdir); + securec_check_ss(rc, "", ""); - rc = snprintf_s(g_instance.datadir_cxt.multixactDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_multixact", dssdir); - securec_check_ss(rc, "", ""); + rc = snprintf_s(g_instance.datadir_cxt.controlBakPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control.backup", + dssdir); + securec_check_ss(rc, "", ""); - rc = snprintf_s(g_instance.datadir_cxt.xlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_xlog%d", dssdir, - g_instance.attr.attr_storage.dms_attr.instance_id); - securec_check_ss(rc, "", ""); + ss_initdwsubdir(dssdir, g_instance.attr.attr_storage.dms_attr.instance_id); +} - rc = snprintf_s(g_instance.datadir_cxt.controlPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control", dssdir); - securec_check_ss(rc, "", ""); +void initDSSConf(void) +{ + if (!ENABLE_DSS) { + return; + } - rc = snprintf_s(g_instance.datadir_cxt.controlBakPath, MAXPGPATH, MAXPGPATH - 1, "%s/pg_control.backup", - dssdir); - securec_check_ss(rc, "", ""); + // check whether dss connect is successful. 
diff --git a/src/common/backend/utils/misc/guc/guc_storage.cpp b/src/common/backend/utils/misc/guc/guc_storage.cpp
index 931a0044c174482c1c8af8428fa8fdc9cc0c9ef2..650561b61c2d07249c8839fa7082b22a07ebea99 100755
--- a/src/common/backend/utils/misc/guc/guc_storage.cpp
+++ b/src/common/backend/utils/misc/guc/guc_storage.cpp
@@ -215,6 +215,7 @@ static bool check_ss_rdma_work_config(char** newval, void** extra, GucSource sou
 static bool check_ss_dss_vg_name(char** newval, void** extra, GucSource source);
 static bool check_ss_dss_conn_path(char** newval, void** extra, GucSource source);
 static bool check_ss_enable_ssl(bool* newval, void** extra, GucSource source);
+static bool check_ss_enable_ondemand_recovery(bool* newval, void** extra, GucSource source);
 static void assign_ss_enable_aio(bool newval, void *extra);
 #ifdef USE_ASSERT_CHECKING
 static void assign_ss_enable_verify_page(bool newval, void *extra);
@@ -1035,6 +1036,19 @@ static void InitStorageConfigureNamesBool()
             NULL,
             NULL},
 
+        {{"ss_enable_ondemand_recovery",
+            PGC_POSTMASTER,
+            NODE_SINGLENODE,
+            SHARED_STORAGE_OPTIONS,
+            gettext_noop("Whether to use on-demand recovery."),
+            NULL,
+            GUC_SUPERUSER_ONLY},
+            &g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery,
+            false,
+            check_ss_enable_ondemand_recovery,
+            NULL,
+            NULL},
+
 #ifdef USE_ASSERT_CHECKING
         {{"ss_enable_verify_page",
             PGC_SIGHUP,
@@ -3608,7 +3622,21 @@ static void InitStorageConfigureNamesInt()
             64,
             NULL,
             NULL,
-            NULL}, 
+            NULL},
+        {{"ss_ondemand_recovery_mem_size",
+            PGC_POSTMASTER,
+            NODE_ALL,
+            SHARED_STORAGE_OPTIONS,
+            gettext_noop("Sets the amount of memory used for on-demand recovery buffers."),
+            NULL,
+            GUC_SUPERUSER_ONLY | GUC_UNIT_KB},
+            &g_instance.attr.attr_storage.dms_attr.ondemand_recovery_mem_size,
+            4194304,
+            1048576,
+            104857600,
+            NULL,
+            NULL,
+            NULL},
         /* End-of-list marker */
         {{NULL,
             (GucContext)0,
@@ -6031,6 +6059,17 @@ static bool check_ss_enable_ssl(bool *newval, void **extra, GucSource source)
     return true;
 }
 
+static bool check_ss_enable_ondemand_recovery(bool* newval, void** extra, GucSource source)
+{
+    if (*newval) {
+        if (pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) {
+            ereport(ERROR, (errmsg("ondemand_recovery cannot be enabled while openGauss is running "
+                "with an old version number.")));
in old version."))); + return false; + } + } + return true; +} + #ifdef USE_ASSERT_CHECKING static void assign_ss_enable_verify_page(bool newval, void *extra) { diff --git a/src/common/backend/utils/misc/postgresql_single.conf.sample b/src/common/backend/utils/misc/postgresql_single.conf.sample index 60d8778f3ea8f3118fde947a9b855af7e9dd2d7e..1194424f55201cd9117a08cca86c0f62221479fc 100644 --- a/src/common/backend/utils/misc/postgresql_single.conf.sample +++ b/src/common/backend/utils/misc/postgresql_single.conf.sample @@ -849,3 +849,5 @@ job_queue_processes = 10 # Number of concurrent jobs, optional: [0..1000] #ss_log_backup_file_count = 10 #ss_log_max_file_size = 10MB #ss_parallel_thread_count = 16 +#ss_enable_ondemand_recovery = off +#ss_ondemand_recovery_mem_size = 4GB # min: 1GB, max: 100GB diff --git a/src/gausskernel/CMakeLists.txt b/src/gausskernel/CMakeLists.txt index 839f5dc795ced0406e80cb909c198a41c384f150..29fa43a5ce2e54dea57f02ff8633d99477b7a662 100755 --- a/src/gausskernel/CMakeLists.txt +++ b/src/gausskernel/CMakeLists.txt @@ -170,6 +170,7 @@ list(APPEND gaussdb_objects $ $ $ + $ $ $ $ diff --git a/src/gausskernel/ddes/adapter/ss_dms.cpp b/src/gausskernel/ddes/adapter/ss_dms.cpp index 2dfdb8750a9c677bbc44ba65685bfe758cf7c7ee..5f4304530b020fa78e41d029b2a135fc2377b3c1 100644 --- a/src/gausskernel/ddes/adapter/ss_dms.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms.cpp @@ -125,6 +125,7 @@ int ss_dms_func_init() SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_init_logger)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_refresh_logger)); SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_validate_drc)); + SS_RETURN_IFERR(DMS_LOAD_SYMBOL_FUNC(dms_reform_req_opengauss_ondemand_redo_buffer)); g_ss_dms_func.inited = true; return DMS_SUCCESS; } @@ -333,4 +334,10 @@ void dms_validate_drc(dms_context_t *dms_ctx, dms_buf_ctrl_t *ctrl, unsigned lon unsigned char is_dirty) { return g_ss_dms_func.dms_validate_drc(dms_ctx, ctrl, lsn, is_dirty); +} + +int dms_reform_req_opengauss_ondemand_redo_buffer(dms_context_t *dms_ctx, void *block_key, unsigned int key_len, + int *redo_status) +{ + return g_ss_dms_func.dms_reform_req_opengauss_ondemand_redo_buffer(dms_ctx, block_key, key_len, redo_status); } \ No newline at end of file diff --git a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp index bc1700887325e8727914c9abf7e03bfdbe118213..0cf347208ce259595e938806664f1f3fce41b27e 100644 --- a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp +++ b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp @@ -28,6 +28,7 @@ #include "storage/smgr/segment.h" #include "utils/resowner.h" #include "ddes/dms/ss_dms_bufmgr.h" +#include "ddes/dms/ss_reform_common.h" #include "securec_check.h" #include "miscadmin.h" #include "access/double_write.h" @@ -244,7 +245,7 @@ void SmgrNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, con if ((lsn_on_mem != InvalidXLogRecPtr) && (lsn_on_disk > lsn_on_mem)) { RelFileNode rnode = buf_desc->tag.rnode; int elevel = WARNING; - if (!RecoveryInProgress()) { + if (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY) { elevel = PANIC; } ereport(elevel, (errmsg("[%d/%d/%d/%d/%d %d-%d] memory lsn(0x%llx) is less than disk lsn(0x%llx)", @@ -302,6 +303,15 @@ Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const X ClearReadHint(buf_desc->buf_id); TerminateBufferIO(buf_desc, false, BM_VALID); + + /* + * we need redo items to get lastest page in ondemand recovery + */ + if (t_thrd.role != PAGEREDO && SS_ONDEMAND_BUILD_DONE && 
diff --git a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp
index bc1700887325e8727914c9abf7e03bfdbe118213..0cf347208ce259595e938806664f1f3fce41b27e 100644
--- a/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp
+++ b/src/gausskernel/ddes/adapter/ss_dms_bufmgr.cpp
@@ -28,6 +28,7 @@
 #include "storage/smgr/segment.h"
 #include "utils/resowner.h"
 #include "ddes/dms/ss_dms_bufmgr.h"
+#include "ddes/dms/ss_reform_common.h"
 #include "securec_check.h"
 #include "miscadmin.h"
 #include "access/double_write.h"
@@ -244,7 +245,7 @@ void SmgrNetPageCheckDiskLSN(BufferDesc *buf_desc, ReadBufferMode read_mode, con
     if ((lsn_on_mem != InvalidXLogRecPtr) && (lsn_on_disk > lsn_on_mem)) {
         RelFileNode rnode = buf_desc->tag.rnode;
         int elevel = WARNING;
-        if (!RecoveryInProgress()) {
+        if (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY) {
             elevel = PANIC;
         }
         ereport(elevel, (errmsg("[%d/%d/%d/%d/%d %d-%d] memory lsn(0x%llx) is less than disk lsn(0x%llx)",
@@ -302,6 +303,15 @@ Buffer TerminateReadPage(BufferDesc* buf_desc, ReadBufferMode read_mode, const X
 
     ClearReadHint(buf_desc->buf_id);
     TerminateBufferIO(buf_desc, false, BM_VALID);
+
+    /*
+     * we need redo items to get the latest page in ondemand recovery
+     */
+    if (t_thrd.role != PAGEREDO && SS_ONDEMAND_BUILD_DONE && SS_PRIMARY_MODE &&
+        !LWLockHeldByMe(buf_desc->content_lock)) {
+        buf_desc = RedoForOndemandExtremeRTOQuery(buf_desc, RELPERSISTENCE_PERMANENT, buf_desc->tag.forkNum,
+            buf_desc->tag.blockNum, read_mode);
+    }
 
     return buffer;
 }
@@ -472,12 +482,62 @@ Buffer DmsReadPage(Buffer buffer, LWLockMode mode, ReadBufferMode read_mode, boo
         return buffer;
     }
 
+    // standby node must notify primary node to prepare the latest page in ondemand recovery
+    if (SS_STANDBY_ONDEMAND_RECOVERY) {
+        while (!SSOndemandRequestPrimaryRedo(buf_desc->tag)) {
+            SSReadControlFile(REFORM_CTRL_PAGE);
+            if (SS_STANDBY_ONDEMAND_NORMAL) {
+                break;      // ondemand recovery finished, skip
+            } else if (SS_STANDBY_ONDEMAND_BUILD) {
+                return 0;   // in new reform
+            }
+            // still need to request the page
+        }
+    }
+
     if (!StartReadPage(buf_desc, mode)) {
         return 0;
     }
     return TerminateReadPage(buf_desc, read_mode, OidIsValid(buf_ctrl->pblk_relno) ? &pblk : NULL);
 }
 
+bool SSOndemandRequestPrimaryRedo(BufferTag tag)
+{
+    dms_context_t dms_ctx;
+    int32 redo_status = ONDEMAND_REDO_INVALID;
+
+    if (!SS_STANDBY_ONDEMAND_RECOVERY) {
+        return true;
+    }
+
+    ereport(DEBUG1,
+        (errmodule(MOD_DMS),
+            errmsg("[On-demand] start request primary node redo page, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u",
+                tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, tag.rnode.bucketNode, tag.forkNum,
+                tag.blockNum)));
+    InitDmsContext(&dms_ctx);
+    dms_ctx.xmap_ctx.dest_id = (unsigned int)SS_PRIMARY_ID;
+    if (dms_reform_req_opengauss_ondemand_redo_buffer(&dms_ctx, &tag,
+        (unsigned int)sizeof(BufferTag), &redo_status) != DMS_SUCCESS) {
+        ereport(LOG,
+            (errmodule(MOD_DMS),
+                errmsg("[On-demand] request primary node redo page failed, page id [%d/%d/%d/%d/%d %d-%d], "
+                    "redo status %d", tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, (int)tag.rnode.bucketNode,
+                    (int)tag.rnode.opt, tag.forkNum, tag.blockNum, redo_status)));
+        return false;
+    }
+    ereport(DEBUG1,
+        (errmodule(MOD_DMS),
+            errmsg("[On-demand] end request primary node redo page, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u, "
+                "redo status %d", tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, tag.rnode.bucketNode,
+                tag.forkNum, tag.blockNum, redo_status)));
+
+    if (redo_status != ONDEMAND_REDO_DONE) {
+        SSReadControlFile(REFORM_CTRL_PAGE);
+    }
+    return true;
+}
+
 bool DmsReleaseOwner(BufferTag buf_tag, int buf_id)
 {
     dms_buf_ctrl_t *buf_ctrl = GetDmsBufCtrl(buf_id);
@@ -674,7 +734,8 @@ bool CheckPageNeedSkipInRecovery(Buffer buf)
 dms_session_e DMSGetProcType4RequestPage()
 {
     // proc type used in DMS request page
-    if (AmDmsReformProcProcess() || AmPageRedoProcess() || AmStartupProcess()) {
+    if (AmDmsReformProcProcess() || (AmPageRedoProcess() && !SS_ONDEMAND_BUILD_DONE) ||
+        (AmStartupProcess() && !SS_ONDEMAND_BUILD_DONE)) {
         /* When xlog_file_path is not null and enable_dms is set on, main standby always is in recovery.
          * When pmState is PM_HOT_STANDBY, this case indicates main standby support to read only. So here
         * DMS_SESSION_RECOVER_HOT_STANDBY will be returned, it indicates that normal threads can access
diff --git a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp
index b61c1a97041b6cb4ba39f573ffdd637978675d64..c519479d81356792d47e9acad68f7c28b37b83c4 100644
--- a/src/gausskernel/ddes/adapter/ss_dms_callback.cpp
+++ b/src/gausskernel/ddes/adapter/ss_dms_callback.cpp
@@ -1714,6 +1714,7 @@ static int CBReformDoneNotify(void *db_handle)
         g_instance.dms_cxt.SSRecoveryInfo.startup_reform = false;
         g_instance.dms_cxt.SSRecoveryInfo.restart_failover_flag = false;
         g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = NOT_ACTIVE;
+        SSReadControlFile(REFORM_CTRL_PAGE);
         Assert(g_instance.dms_cxt.SSRecoveryInfo.in_flushcopy == false);
         ereport(LOG,
             (errmodule(MOD_DMS),
@@ -1831,6 +1832,57 @@ void DmsCallbackThreadShmemInit(unsigned char need_startup, char **reg_data)
     t_thrd.postgres_cxt.whereToSendOutput = (int)DestNone;
 }
 
+int CBOndemandRedoPageForStandby(void *block_key, int32 *redo_status)
+{
+    BufferTag* tag = (BufferTag *)block_key;
+
+    Assert(SS_PRIMARY_MODE);
+    // do nothing if not in ondemand recovery
+    if (!SS_IN_ONDEMAND_RECOVERY) {
+        ereport(DEBUG1, (errmsg("[On-demand] ignore redo page request, spc/db/rel/bucket "
+            "fork-block: %u/%u/%u/%d %d-%u", tag->rnode.spcNode, tag->rnode.dbNode,
+            tag->rnode.relNode, tag->rnode.bucketNode, tag->forkNum, tag->blockNum)));
+        *redo_status = ONDEMAND_REDO_SKIP;
+        return GS_SUCCESS;
+    }
+
+    Buffer buffer;
+    SegSpace *spc = NULL;
+    uint32 saveInterruptHoldoffCount = t_thrd.int_cxt.InterruptHoldoffCount;
+    *redo_status = ONDEMAND_REDO_DONE;
+    PG_TRY();
+    {
+        if (IsSegmentPhysicalRelNode(tag->rnode)) {
+            spc = spc_open(tag->rnode.spcNode, tag->rnode.dbNode, false, false);
+            buffer = ReadBufferFast(spc, tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL);
+        } else {
+            buffer = ReadBufferWithoutRelcache(tag->rnode, tag->forkNum, tag->blockNum, RBM_NORMAL, NULL, NULL);
+        }
+        ReleaseBuffer(buffer);
+    }
+    PG_CATCH();
+    {
+        t_thrd.int_cxt.InterruptHoldoffCount = saveInterruptHoldoffCount;
+        /* Save error info */
+        ErrorData* edata = CopyErrorData();
+        FlushErrorState();
+        FreeErrorData(edata);
+        ereport(PANIC, (errmsg("[On-demand] Error happened when primary redo page for standby, spc/db/rel/bucket "
+            "fork-block: %u/%u/%u/%d %d-%u", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
+            tag->rnode.bucketNode, tag->forkNum, tag->blockNum)));
+    }
+    PG_END_TRY();
+
+    if (BufferIsInvalid(buffer)) {
+        *redo_status = ONDEMAND_REDO_FAIL;
+    }
+
+    ereport(DEBUG1, (errmsg("[On-demand] redo page for standby done, spc/db/rel/bucket fork-block: %u/%u/%u/%d %d-%u, "
+        "redo status: %d", tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode,
+        tag->rnode.bucketNode, tag->forkNum, tag->blockNum, *redo_status)));
+    return GS_SUCCESS;
+}
+
 void DmsInitCallback(dms_callback_t *callback)
 {
     // used in reform
@@ -1850,6 +1902,7 @@ void DmsInitCallback(dms_callback_t *callback)
     callback->failover_promote_opengauss = CBFailoverPromote;
     callback->reform_start_notify = CBReformStartNotify;
     callback->reform_set_dms_role = CBReformSetDmsRole;
+    callback->opengauss_ondemand_redo_buffer = CBOndemandRedoPageForStandby;
 
     callback->get_page_hash_val = CBPageHashCode;
     callback->read_local_page4transfer = CBEnterLocalPage;
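CBOndemandRedoPageForStandby() and SSOndemandRequestPrimaryRedo() in ss_dms_bufmgr.cpp above are the two ends of one round trip: the standby ships a BufferTag, the primary "replays" the page simply by reading it through the buffer manager (on the primary, TerminateReadPage routes through RedoForOndemandExtremeRTOQuery), and a status code travels back. The status values used by both sides, as named in this patch (their numeric values are defined elsewhere, not shown here):

    ONDEMAND_REDO_INVALID   /* initial value, before the request completes */
    ONDEMAND_REDO_DONE      /* primary replayed the page; standby may transfer it */
    ONDEMAND_REDO_SKIP      /* on-demand recovery already finished; nothing to do */
    ONDEMAND_REDO_FAIL      /* primary could not read/replay the page */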
diff --git a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp
index e9ae32649ba9237b07bb06fbc1a480102e9cd4ba..158d90615b94354fa1b9ae48a62892c2300df1be 100644
--- a/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp
+++ b/src/gausskernel/ddes/adapter/ss_dms_recovery.cpp
@@ -119,7 +119,13 @@ bool SSRecoveryNodes()
             break;
         }
         LWLockRelease(ControlFileLock);
-
+
+        /* do not wait when on-demand HashMap build done */
+        if (SS_ONDEMAND_BUILD_DONE) {
+            result = true;
+            break;
+        }
+
         /* If main standby is set hot standby to on, when it reach consistency or recovery all xlogs in disk,
          * recovery phase could be regarded successful in hot_standby thus set pmState = PM_HOT_STANDBY, which
          * indicate database systerm is ready to accept read only connections.
@@ -149,98 +155,8 @@ bool SSRecoveryApplyDelay()
     return true;
 }
 
-void SSReadControlFile(int id, bool updateDmsCtx)
-{
-    pg_crc32c crc;
-    errno_t rc = EOK;
-    int fd = -1;
-    char *fname = NULL;
-    bool retry = false;
-    int read_size = 0;
-    int len = 0;
-    fname = XLOG_CONTROL_FILE;
-
-loop:
-    fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
-    if (fd < 0) {
-        ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname)));
-    }
-
-    off_t seekpos = (off_t)BLCKSZ * id;
-
-    if (id == REFORM_CTRL_PAGE) {
-        len = sizeof(ss_reformer_ctrl_t);
-    } else {
-        len = sizeof(ControlFileData);
-    }
-
-    read_size = (int)BUFFERALIGN(len);
-    char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER)));
-    if (pread(fd, buffer, read_size, seekpos) != read_size) {
-        ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m")));
-    }
-
-    if (id == REFORM_CTRL_PAGE) {
-        rc = memcpy_s(&g_instance.dms_cxt.SSReformerControl, len, buffer, len);
-        securec_check(rc, "", "");
-        if (close(fd) < 0) {
-            ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
-        }
-
-        /* Now check the CRC. */
-        INIT_CRC32C(crc);
-        COMP_CRC32C(crc, (char *)&g_instance.dms_cxt.SSReformerControl, offsetof(ss_reformer_ctrl_t, crc));
-        FIN_CRC32C(crc);
-
-        if (!EQ_CRC32C(crc, g_instance.dms_cxt.SSReformerControl.crc)) {
-            if (retry == false) {
-                ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname)));
-                fname = XLOG_CONTROL_FILE_BAK;
-                retry = true;
-                goto loop;
-            } else {
-                ereport(FATAL, (errmsg("incorrect checksum in control file")));
-            }
-        }
-    } else {
-        ControlFileData* controlFile = NULL;
-        ControlFileData tempControlFile;
-        if (updateDmsCtx) {
-            controlFile = &tempControlFile;
-        } else {
-            controlFile = t_thrd.shemem_ptr_cxt.ControlFile;
-        }
-
-        rc = memcpy_s(controlFile, (size_t)len, buffer, (size_t)len);
-        securec_check(rc, "", "");
-        if (close(fd) < 0) {
-            ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
-        }
-
-        /* Now check the CRC. */
-        INIT_CRC32C(crc);
-        COMP_CRC32C(crc, (char *)controlFile, offsetof(ControlFileData, crc));
-        FIN_CRC32C(crc);
-
-        if (!EQ_CRC32C(crc, controlFile->crc)) {
-            if (retry == false) {
-                ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname)));
-                fname = XLOG_CONTROL_FILE_BAK;
-                retry = true;
-                goto loop;
-            } else {
-                ereport(FATAL, (errmsg("incorrect checksum in control file")));
-            }
-        }
-
-        if (XLByteLE(g_instance.dms_cxt.ckptRedo, controlFile->checkPointCopy.redo)) {
-            g_instance.dms_cxt.ckptRedo = controlFile->checkPointCopy.redo;
-        }
-    }
-}
-
 /* initialize reformer ctrl parameter when initdb */
-void SSWriteReformerControlPages(void)
+void SSInitReformerControlPages(void)
 {
     /*
      * If already exists control file, reformer page must have been initialized
@@ -268,6 +184,9 @@ void SSWriteReformerControlPages(void)
     Assert(!dss_exist_file(XLOG_CONTROL_FILE));
     g_instance.dms_cxt.SSReformerControl.list_stable = 0;
     g_instance.dms_cxt.SSReformerControl.primaryInstId = SS_MY_INST_ID;
+    g_instance.dms_cxt.SSReformerControl.recoveryInstId = INVALID_INSTANCEID;
+    g_instance.dms_cxt.SSReformerControl.version = REFORM_CTRL_VERSION;
+    g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
     (void)printf("[SS] Current node:%d initdb first, will become PRIMARY for first-time SS cluster startup.\n",
         SS_MY_INST_ID);
 
@@ -370,4 +289,4 @@ void ss_switchover_promoting_dw_init()
     dw_init();
     g_instance.dms_cxt.dw_init = true;
     ereport(LOG, (errmodule(MOD_DMS), errmsg("[SS switchover] dw init finished")));
-}
\ No newline at end of file
+}
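The initdb path above now fills three new fields in the reform control page. Reconstructed from the fields this patch reads and writes, the two on-disk layouts presumably look as follows (member order and exact types are assumptions; only the crc-last rule is certain, since both CRC computations use offsetof(..., crc)):

    typedef struct ss_old_reformer_ctrl_t {
        uint64 list_stable;     /* stable instances list (printed with %lu) */
        int primaryInstId;      /* current primary instance */
        pg_crc32c crc;          /* must stay last: CRC covers the bytes before it */
    } ss_old_reformer_ctrl_t;

    typedef struct ss_reformer_ctrl_t {
        uint32 version;                     /* REFORM_CTRL_VERSION */
        uint64 list_stable;
        int primaryInstId;
        int recoveryInstId;                 /* node whose pg_xlog%d dir is replayed */
        SSGlobalClusterState clusterStatus; /* CLUSTER_NORMAL / CLUSTER_IN_ONDEMAND_* */
        pg_crc32c crc;                      /* must stay last */
    } ss_reformer_ctrl_t;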
diff --git a/src/gausskernel/ddes/adapter/ss_reform_common.cpp b/src/gausskernel/ddes/adapter/ss_reform_common.cpp
index a0f67de2586551c4ae72ec0472a0b0d4a6286380..c3f1bedfb546c86f46c01d7edea000530fa5ef60 100644
--- a/src/gausskernel/ddes/adapter/ss_reform_common.cpp
+++ b/src/gausskernel/ddes/adapter/ss_reform_common.cpp
@@ -99,24 +99,16 @@ int SSXLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources, char* xlog_
     return -1;
 }
 
-static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
-{
-    if (t_thrd.xlog_cxt.readSource == XLOG_FROM_PG_XLOG && emode == LOG) {
-        if (XLByteEQ(RecPtr, t_thrd.xlog_cxt.lastComplaint)) {
-            emode = DEBUG1;
-        } else {
-            t_thrd.xlog_cxt.lastComplaint = RecPtr;
-        }
-    }
-    return emode;
-}
-
-bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf)
+int SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf,
+    int readLen)
 {
     uint32 preReadOff;
     XLogRecPtr xlogFlushPtrForPerRead = xlogreader->xlogFlushPtrForPerRead;
     bool isReadFile = true;
 
+    Assert(readLen > 0);
+    Assert(readLen <= XLogPreReadSize);
+
     do {
         /*
         * That source is XLOG_FROM_STREAM indicate that walreceiver receive xlog and walrecwriter have wrriten xlog
@@ -136,7 +128,7 @@ bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, X
         if ((XLByteInPreReadBuf(targetPagePtr, xlogreader->preReadStartPtr) && !SS_STANDBY_CLUSTER_MAIN_STANDBY) ||
             (!isReadFile)) {
             preReadOff = targetPagePtr % XLogPreReadSize;
-            int err = memcpy_s(buf, XLOG_BLCKSZ, xlogreader->preReadBuf + preReadOff, XLOG_BLCKSZ);
+            int err = memcpy_s(buf, readLen, xlogreader->preReadBuf + preReadOff, readLen);
             securec_check(err, "\0", "\0");
             break;
         } else {
@@ -155,7 +147,7 @@ bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, X
         }
     } while (true);
 
-    return true;
+    return readLen;
 }
 
 XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data, Size alignedSize)
@@ -190,28 +182,131 @@ XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private
     return state;
 }
 
-void SSGetXlogPath()
+void SSGetRecoveryXlogPath()
 {
-    int primaryId = -1;
     errno_t rc = EOK;
     char *dssdir = g_instance.attr.attr_storage.dss_attr.ss_dss_vg_name;
 
-    /* get primary inst id */
-    primaryId = SSGetPrimaryInstId();
-
     rc = snprintf_s(g_instance.dms_cxt.SSRecoveryInfo.recovery_xlogDir, MAXPGPATH, MAXPGPATH - 1, "%s/pg_xlog%d",
-        dssdir, primaryId);
+        dssdir, g_instance.dms_cxt.SSReformerControl.recoveryInstId);
     securec_check_ss(rc, "", "");
 }
 
-void SSSaveReformerCtrl()
+static void SSSaveOldReformerCtrl()
+{
+    ss_reformer_ctrl_t new_ctrl = g_instance.dms_cxt.SSReformerControl;
+    ss_old_reformer_ctrl_t old_ctrl = {new_ctrl.list_stable, new_ctrl.primaryInstId, new_ctrl.crc};
+
+    int len = sizeof(ss_old_reformer_ctrl_t);
+    int write_size = (int)BUFFERALIGN(len);
+    char buffer[write_size] __attribute__((__aligned__(ALIGNOF_BUFFER))) = { 0 };
+    char *fname[2];
+    int fd = -1;
+
+    errno_t err = memcpy_s(&buffer, write_size, &old_ctrl, len);
+    securec_check(err, "\0", "\0");
+
+    INIT_CRC32C(((ss_old_reformer_ctrl_t *)buffer)->crc);
+    COMP_CRC32C(((ss_old_reformer_ctrl_t *)buffer)->crc, (char *)buffer, offsetof(ss_old_reformer_ctrl_t, crc));
+    FIN_CRC32C(((ss_old_reformer_ctrl_t *)buffer)->crc);
+
+    fname[0] = XLOG_CONTROL_FILE_BAK;
+    fname[1] = XLOG_CONTROL_FILE;
+
+    for (int i = 0; i < BAK_CTRL_FILE_NUM; i++) {
+        if (i == 0) {
+            fd = BasicOpenFile(fname[i], O_CREAT | O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+        } else {
+            fd = BasicOpenFile(fname[i], O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+        }
+
+        if (fd < 0) {
+            ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname[i])));
+        }
+
+        SSWriteInstanceControlFile(fd, buffer, REFORM_CTRL_PAGE, write_size);
+        if (close(fd)) {
+            ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
+        }
+    }
+}
+
+static bool SSReadOldReformerCtrl()
+{
+    ss_reformer_ctrl_t *new_ctrl = &g_instance.dms_cxt.SSReformerControl;
+    ss_old_reformer_ctrl_t old_ctrl;
+    pg_crc32c crc;
+    int fd = -1;
+    bool retry = false;
+    char *fname = XLOG_CONTROL_FILE;
+
+loop:
+    fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+    if (fd < 0) {
+        ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname)));
+    }
+
+    off_t seekpos = (off_t)BLCKSZ * REFORM_CTRL_PAGE;
+    int len = sizeof(ss_old_reformer_ctrl_t);
+
+    int read_size = (int)BUFFERALIGN(len);
+    char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER)));
+    if (pread(fd, buffer, read_size, seekpos) != read_size) {
+        ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m")));
+    }
+
+    errno_t rc = memcpy_s(&old_ctrl, len, buffer, len);
+    securec_check(rc, "", "");
+    if (close(fd) < 0) {
+        ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
+    }
+
+    /* Now check the CRC. */
+    INIT_CRC32C(crc);
+    COMP_CRC32C(crc, (char *)&old_ctrl, offsetof(ss_old_reformer_ctrl_t, crc));
+    FIN_CRC32C(crc);
+
+    if (!EQ_CRC32C(crc, old_ctrl.crc)) {
+        if (retry == false) {
+            ereport(WARNING,
+                (errmsg("control file \"%s\" contains incorrect checksum in upgrade mode, try backup file", fname)));
+            fname = XLOG_CONTROL_FILE_BAK;
+            retry = true;
+            goto loop;
+        } else {
+            ereport(WARNING,
+                (errmsg("backup control file \"%s\" contains incorrect checksum in upgrade mode, "
+                    "try again in post-upgrade mode", fname)));
+            return false;
+        }
+    }
+
+    // new params are set to their initial values
+    new_ctrl->version = REFORM_CTRL_VERSION;
+    new_ctrl->recoveryInstId = INVALID_INSTANCEID;
+    new_ctrl->clusterStatus = CLUSTER_NORMAL;
+
+    // existing params are inherited
+    new_ctrl->primaryInstId = old_ctrl.primaryInstId;
+    new_ctrl->list_stable = old_ctrl.list_stable;
+    new_ctrl->crc = old_ctrl.crc;
+
+    return true;
+}
+
+void SSSaveReformerCtrl(bool force)
 {
     int fd = -1;
     int len;
     errno_t err = EOK;
     char *fname[2];
 
-    len = sizeof(ss_reformer_ctrl_t);
+    if ((pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) && !force) {
+        SSSaveOldReformerCtrl();
+        return;
+    }
+
+    len = sizeof(ss_reformer_ctrl_t);
     int write_size = (int)BUFFERALIGN(len);
     char buffer[write_size] __attribute__((__aligned__(ALIGNOF_BUFFER))) = { 0 };
 
@@ -243,6 +338,110 @@ void SSSaveReformerCtrl()
     }
 }
 
+void SSReadControlFile(int id, bool updateDmsCtx)
+{
+    pg_crc32c crc;
+    errno_t rc = EOK;
+    int fd = -1;
+    char *fname = NULL;
+    bool retry = false;
+    int read_size = 0;
+    int len = 0;
+    fname = XLOG_CONTROL_FILE;
+
+    if ((pg_atomic_read_u32(&WorkingGrandVersionNum) < ONDEMAND_REDO_VERSION_NUM) && (id == REFORM_CTRL_PAGE)) {
+        if (SSReadOldReformerCtrl()) {
+            return;
+        }
+
+        // maybe the primary node already upgraded the pg_control file; sleep and retry reading in the latest mode
+        if (SS_STANDBY_MODE) {
+            pg_usleep(5000000); /* 5 sec */
+            goto loop;
+        } else {
+            ereport(PANIC, (errmsg("incorrect checksum in control file")));
+        }
+    }
+
+loop:
+    fd = BasicOpenFile(fname, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
+    if (fd < 0) {
+        ereport(FATAL, (errcode_for_file_access(), errmsg("could not open control file \"%s\": %m", fname)));
+    }
+
+    off_t seekpos = (off_t)BLCKSZ * id;
+
+    if (id == REFORM_CTRL_PAGE) {
+        len = sizeof(ss_reformer_ctrl_t);
+    } else {
+        len = sizeof(ControlFileData);
+    }
+
+    read_size = (int)BUFFERALIGN(len);
+    char buffer[read_size] __attribute__((__aligned__(ALIGNOF_BUFFER)));
+    if (pread(fd, buffer, read_size, seekpos) != read_size) {
+        ereport(PANIC, (errcode_for_file_access(), errmsg("could not read from control file: %m")));
+    }
+
+    if (id == REFORM_CTRL_PAGE) {
+        rc = memcpy_s(&g_instance.dms_cxt.SSReformerControl, len, buffer, len);
+        securec_check(rc, "", "");
+        if (close(fd) < 0) {
+            ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
+        }
+
+        /* Now check the CRC. */
+        INIT_CRC32C(crc);
+        COMP_CRC32C(crc, (char *)&g_instance.dms_cxt.SSReformerControl, offsetof(ss_reformer_ctrl_t, crc));
+        FIN_CRC32C(crc);
+
+        if (!EQ_CRC32C(crc, g_instance.dms_cxt.SSReformerControl.crc)) {
+            if (retry == false) {
+                ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname)));
+                fname = XLOG_CONTROL_FILE_BAK;
+                retry = true;
+                goto loop;
+            } else {
+                ereport(FATAL, (errmsg("incorrect checksum in control file")));
+            }
+        }
+    } else {
+        ControlFileData* controlFile = NULL;
+        ControlFileData tempControlFile;
+        if (updateDmsCtx) {
+            controlFile = &tempControlFile;
+        } else {
+            controlFile = t_thrd.shemem_ptr_cxt.ControlFile;
+        }
+
+        rc = memcpy_s(controlFile, (size_t)len, buffer, (size_t)len);
+        securec_check(rc, "", "");
+        if (close(fd) < 0) {
+            ereport(PANIC, (errcode_for_file_access(), errmsg("could not close control file: %m")));
+        }
+
+        /* Now check the CRC. */
+        INIT_CRC32C(crc);
+        COMP_CRC32C(crc, (char *)controlFile, offsetof(ControlFileData, crc));
+        FIN_CRC32C(crc);
+
+        if (!EQ_CRC32C(crc, controlFile->crc)) {
+            if (retry == false) {
+                ereport(WARNING, (errmsg("control file \"%s\" contains incorrect checksum, try backup file", fname)));
+                fname = XLOG_CONTROL_FILE_BAK;
+                retry = true;
+                goto loop;
+            } else {
+                ereport(FATAL, (errmsg("incorrect checksum in control file")));
+            }
+        }
+
+        if (XLByteLE(g_instance.dms_cxt.ckptRedo, controlFile->checkPointCopy.redo)) {
+            g_instance.dms_cxt.ckptRedo = controlFile->checkPointCopy.redo;
+        }
+    }
+}
+
 void SSClearSegCache()
 {
     (void)LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
diff --git a/src/gausskernel/process/postmaster/pagerepair.cpp b/src/gausskernel/process/postmaster/pagerepair.cpp
index eb0264f7dbe05e5de2e8ecb661e7a1973222df38..ac2815fe9adf77742369a17f6a11cbf6338b5f91 100644
--- a/src/gausskernel/process/postmaster/pagerepair.cpp
+++ b/src/gausskernel/process/postmaster/pagerepair.cpp
@@ -24,7 +24,6 @@
 #include "access/xlog_basic.h"
 #include "access/xlog_internal.h"
 #include "access/multi_redo_api.h"
-#include "access/extreme_rto/page_redo.h"
 #include "access/parallel_recovery/page_redo.h"
 #include "access/parallel_recovery/dispatcher.h"
 #include "catalog/catalog.h"
@@ -858,7 +857,7 @@ void WaitRepalyFinish()
 {
     /* file repair finish, need clean the invalid page */
     if (IsExtremeRedo()) {
-        extreme_rto::WaitAllReplayWorkerIdle();
+        ExtremeWaitAllReplayWorkerIdle();
     } else if (IsParallelRedo()) {
         parallel_recovery::WaitAllPageWorkersQueueEmpty();
     } else {
@@ -1088,9 +1087,9 @@ void RenameRepairFile(RepairFileKey *key, bool clear_entry)
 
     /* file repair finish, need clean the invalid page */
     if (IsExtremeRedo()) {
-        extreme_rto::DispatchCleanInvalidPageMarkToAllRedoWorker(*key);
-        extreme_rto::DispatchClosefdMarkToAllRedoWorker();
-        extreme_rto::WaitAllReplayWorkerIdle();
+        ExtremeDispatchCleanInvalidPageMarkToAllRedoWorker(*key);
+        ExtremeDispatchClosefdMarkToAllRedoWorker();
+        ExtremeWaitAllReplayWorkerIdle();
     } else if (IsParallelRedo()) {
         if (AmStartupProcess()) {
             ProcTxnWorkLoad(true);
@@ -1332,8 +1331,8 @@ void UnlinkOldBadFile(char *path, RepairFileKey key)
 {
     /* wait the xlog repaly finish */
     if (IsExtremeRedo()) {
-        extreme_rto::DispatchClosefdMarkToAllRedoWorker();
-        extreme_rto::WaitAllReplayWorkerIdle();
+        ExtremeDispatchClosefdMarkToAllRedoWorker();
+        ExtremeWaitAllReplayWorkerIdle();
     } else if (IsParallelRedo()) {
         parallel_recovery::SendClosefdMarkToAllWorkers();
         parallel_recovery::WaitAllPageWorkersQueueEmpty();
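pagerepair.cpp (and startup.cpp further down) stop calling into the extreme_rto:: namespace directly and instead go through Extreme*-prefixed free functions; together with extreme_rto_redo_api.cpp being added to the transam CMakeLists.txt near the end of this patch, this suggests a thin dispatch layer that can route to either the existing extreme_rto implementation or the new ondemand_extreme_rto one. A hedged sketch of what such a wrapper presumably looks like (the actual file is not part of this diff):

    /* extreme_rto_redo_api.cpp (assumed): forward to whichever redo engine is active */
    void ExtremeWaitAllReplayWorkerIdle()
    {
        if (SS_IN_ONDEMAND_RECOVERY) {
            ondemand_extreme_rto::WaitAllReplayWorkerIdle();
        } else {
            extreme_rto::WaitAllReplayWorkerIdle();
        }
    }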
diff --git a/src/gausskernel/process/postmaster/postmaster.cpp b/src/gausskernel/process/postmaster/postmaster.cpp
index 7896e89bd1499e48af031486a6de00861620be7c..9672942fb727fc91a5cd07bd6d27ec166d0bc27b 100644
--- a/src/gausskernel/process/postmaster/postmaster.cpp
+++ b/src/gausskernel/process/postmaster/postmaster.cpp
@@ -3027,6 +3027,13 @@ int PostmasterMain(int argc, char* argv[])
             }
             ereport(LOG, (errmsg("[SS reform] Success: node:%d wait for PRIMARY:%d to finish 1st reform",
                 g_instance.attr.attr_storage.dms_attr.instance_id, src_id)));
+
+            while (SS_OFFICIAL_RECOVERY_NODE && SS_CLUSTER_NOT_NORAML) {
+                pg_usleep(SLEEP_ONE_SEC);
+                SSReadControlFile(REFORM_CTRL_PAGE);
+                ereport(WARNING, (errmsg("[on-demand] node%d is last primary node, waiting for on-demand recovery done",
+                    g_instance.attr.attr_storage.dms_attr.instance_id)));
+            }
         }
     }
 
@@ -3063,8 +3070,6 @@ int PostmasterMain(int argc, char* argv[])
         }
     }
 
-
-
     /*
      * We're ready to rock and roll...
      */
@@ -4001,7 +4006,8 @@ static int ServerLoop(void)
         (AutoVacuumingActive() || t_thrd.postmaster_cxt.start_autovac_launcher) && pmState == PM_RUN &&
         !dummyStandbyMode && u_sess->attr.attr_common.upgrade_mode != 1 &&
         !g_instance.streaming_dr_cxt.isInSwitchover &&
-        !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM) {
+        !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM &&
+        !SS_IN_ONDEMAND_RECOVERY) {
         g_instance.pid_cxt.AutoVacPID = initialize_util_thread(AUTOVACUUM_LAUNCHER);
 
         if (g_instance.pid_cxt.AutoVacPID != 0)
@@ -6621,29 +6627,6 @@ dms_demote:
     PostmasterStateMachine();
 }
 
-/*
- * Reaper -- get current time.
- */
-static void GetTimeNowForReaperLog(char* nowTime, int timeLen)
-{
-    time_t formatTime;
-    struct timeval current = {0};
-    const int tmpBufSize = 32;
-    char tmpBuf[tmpBufSize] = {0};
-
-    if (nowTime == NULL || timeLen == 0) {
-        return;
-    }
-
-    (void)gettimeofday(&current, NULL);
-    formatTime = current.tv_sec;
-    struct tm* pTime = localtime(&formatTime);
-    strftime(tmpBuf, sizeof(tmpBuf), "%Y-%m-%d %H:%M:%S", pTime);
-
-    errno_t rc = sprintf_s(nowTime, timeLen - 1, "%s.%ld ", tmpBuf, current.tv_usec / 1000);
-    securec_check_ss(rc, "\0", "\0");
-}
-
 /*
  * Reaper -- encap reaper prefix log.
 */
@@ -6653,7 +6636,7 @@ static char* GetReaperLogPrefix(char* buf, int bufLen)
     char timeBuf[bufSize] = {0};
     errno_t rc;
 
-    GetTimeNowForReaperLog(timeBuf, bufSize);
+    get_time_now(timeBuf, bufSize);
 
     rc = memset_s(buf, bufLen, 0, bufLen);
     securec_check(rc, "\0", "\0");
@@ -6859,7 +6842,8 @@ static void reaper(SIGNAL_ARGS)
         if (!u_sess->proc_cxt.IsBinaryUpgrade && AutoVacuumingActive() && g_instance.pid_cxt.AutoVacPID == 0 &&
             !dummyStandbyMode && u_sess->attr.attr_common.upgrade_mode != 1 &&
             !g_instance.streaming_dr_cxt.isInSwitchover &&
-            !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM)
+            !SS_STANDBY_MODE && !SS_PERFORMING_SWITCHOVER && !SS_STANDBY_FAILOVER && !SS_IN_REFORM &&
+            !SS_IN_ONDEMAND_RECOVERY)
             g_instance.pid_cxt.AutoVacPID = initialize_util_thread(AUTOVACUUM_LAUNCHER);
 
         if (SS_REFORM_PARTNER) {
@@ -7013,8 +6997,10 @@ static void reaper(SIGNAL_ARGS)
                 GetReaperLogPrefix(logBuf, ReaperLogBufSize), wal_get_role_string(get_cur_mode()));
 
             /* at this point we are really open for business */
-            write_stderr("%s LOG: database system is ready to accept connections\n",
-                GetReaperLogPrefix(logBuf, ReaperLogBufSize));
+            if (!SS_REPLAYED_BY_ONDEMAND) {
+                write_stderr("%s LOG: database system is ready to accept connections\n",
+                    GetReaperLogPrefix(logBuf, ReaperLogBufSize));
+            }
 
             continue;
         }
@@ -10014,12 +10000,12 @@ static void sigusr1_handler(SIGNAL_ARGS)
     }
 
     if (ENABLE_DMS && (mode = CheckSwitchoverSignal())) {
-        if (SS_NORMAL_STANDBY && pmState == PM_RUN) {
+        SSReadControlFile(REFORM_CTRL_PAGE);
+        if (SS_NORMAL_STANDBY && pmState == PM_RUN && !SS_STANDBY_ONDEMAND_RECOVERY) {
             SSDoSwitchover();
         } else {
             ereport(LOG, (errmsg("Current mode is not NORMAL STANDBY, SS switchover command ignored.")));
         }
-
     }
 
     if ((mode = CheckSwitchoverSignal()) != 0 && WalRcvIsOnline() && DataRcvIsOnline() &&
diff --git a/src/gausskernel/process/postmaster/startup.cpp b/src/gausskernel/process/postmaster/startup.cpp
index f051c7c1a8d1a5544393f727c2bdbeaf5a502627..25eb6b8b8befbfaf6063b86a09f781d85947621c 100755
--- a/src/gausskernel/process/postmaster/startup.cpp
+++ b/src/gausskernel/process/postmaster/startup.cpp
@@ -37,7 +37,6 @@
 #include "gssignal/gs_signal.h"
 #include "access/parallel_recovery/dispatcher.h"
-#include "access/extreme_rto/dispatcher.h"
 #include "replication/dcf_replication.h"
 
 /* Signal handlers */
@@ -404,8 +403,8 @@ bool IsFailoverTriggered(void)
     if (AmStartupProcess()) {
         return t_thrd.startup_cxt.failover_triggered;
     } else {
-        uint32 tgigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState));
-        if (tgigger == (uint32)extreme_rto::TRIGGER_FAILOVER) {
+        uint32 tgigger = pg_atomic_read_u32(&g_startupTriggerState);
+        if (tgigger == (uint32)TRIGGER_FAILOVER) {
             return true;
         }
     }
@@ -417,8 +416,8 @@ bool IsSwitchoverTriggered(void)
     if (AmStartupProcess()) {
         return t_thrd.startup_cxt.switchover_triggered;
     } else {
-        uint32 tgigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState));
-        if (tgigger == (uint32)extreme_rto::TRIGGER_SWITCHOVER) {
+        uint32 tgigger = pg_atomic_read_u32(&g_startupTriggerState);
+        if (tgigger == (uint32)TRIGGER_SWITCHOVER) {
             return true;
         }
     }
@@ -430,8 +429,8 @@ bool IsPrimaryTriggered(void)
     if (AmStartupProcess()) {
         return t_thrd.startup_cxt.primary_triggered;
     } else {
-        uint32 tgigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState));
-        if (tgigger == (uint32)extreme_rto::TRIGGER_PRIMARY) {
+        uint32 tgigger = pg_atomic_read_u32(&g_startupTriggerState);
+        if (tgigger == (uint32)TRIGGER_PRIMARY) {
            return true;
         }
     }
@@ -443,8 +442,8 @@ bool IsStandbyTriggered(void)
     if (AmStartupProcess()) {
         return t_thrd.startup_cxt.standby_triggered;
     } else {
-        uint32 tgigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState));
-        if (tgigger == (uint32)extreme_rto::TRIGGER_STADNBY) {
+        uint32 tgigger = pg_atomic_read_u32(&g_startupTriggerState);
+        if (tgigger == (uint32)TRIGGER_STADNBY) {
             return true;
         }
     }
diff --git a/src/gausskernel/process/tcop/utility.cpp b/src/gausskernel/process/tcop/utility.cpp
index 306bde9839f804494bb560b8d49cc11a289e8c3a..7319d69af7206034f9abec2a1dc27e87b7b13fe1 100755
--- a/src/gausskernel/process/tcop/utility.cpp
+++ b/src/gausskernel/process/tcop/utility.cpp
@@ -643,6 +643,30 @@ void PreventCommandDuringRecovery(const char* cmd_name)
             errmsg("cannot execute %s during recovery", cmd_name)));
 }
 
+void PreventCommandDuringSSOndemandRecovery(Node* parseTree)
+{
+    switch (nodeTag(parseTree)) {
+        case T_InsertStmt:
+        case T_DeleteStmt:
+        case T_UpdateStmt:
+        case T_SelectStmt:
+        case T_TransactionStmt:
+        case T_VariableSetStmt:
+        case T_VariableShowStmt:
+            break;
+        default:
+            if (SS_IN_ONDEMAND_RECOVERY) {
+                ereport(ERROR,
+                    (errcode(ERRCODE_RUN_TRANSACTION_DURING_RECOVERY),
+                        errmsg("only INSERT/UPDATE/DELETE/SELECT/SET/SHOW are supported during SS on-demand recovery, "
+                            "got command %d", nodeTag(parseTree))));
+            }
+            break;
+    }
+
+    return;
+}
+
 /*
  * CheckRestrictedOperation: throw error for hazardous command if we're
  * inside a security restriction context.
diff --git a/src/gausskernel/process/threadpool/knl_instance.cpp b/src/gausskernel/process/threadpool/knl_instance.cpp
index 0e2373b3cd53aaa416d12d50b79d4f00583ba00b..8dda0bb12d31aa8379b308af29541ff3bed66377 100755
--- a/src/gausskernel/process/threadpool/knl_instance.cpp
+++ b/src/gausskernel/process/threadpool/knl_instance.cpp
@@ -190,6 +190,7 @@ static void knl_g_dms_init(knl_g_dms_context *dms_cxt)
     dms_cxt->SSRecoveryInfo.in_failover = false;
     dms_cxt->SSRecoveryInfo.in_flushcopy = false;
     dms_cxt->SSRecoveryInfo.no_backend_left = false;
+    dms_cxt->SSRecoveryInfo.in_ondemand_recovery = false;
     dms_cxt->SSRecoveryInfo.startup_need_exit_normally = false;
     dms_cxt->SSRecoveryInfo.recovery_trapped_in_page_request = false;
     dms_cxt->log_timezone = NULL;
@@ -301,6 +302,8 @@ static void knl_g_parallel_redo_init(knl_g_parallel_redo_context* predo_cxt)
 
     rc = memset_s(&predo_cxt->redoCpuBindcontrl, sizeof(RedoCpuBindControl), 0, sizeof(RedoCpuBindControl));
     securec_check(rc, "", "");
+
+    predo_cxt->redoItemHash = NULL;
 }
 
 static void knl_g_parallel_decode_init(knl_g_parallel_decode_context* pdecode_cxt)
diff --git a/src/gausskernel/process/threadpool/knl_thread.cpp b/src/gausskernel/process/threadpool/knl_thread.cpp
index 13d33ffe96e4508c231d11775ce56d34688483cb..a4d76e0bee8c20eefc47112263b2f7d550d3e7be 100755
--- a/src/gausskernel/process/threadpool/knl_thread.cpp
+++ b/src/gausskernel/process/threadpool/knl_thread.cpp
@@ -1707,6 +1707,14 @@ static void knl_t_dms_context_init(knl_t_dms_context *dms_cxt)
     securec_check(rc, "\0", "\0");
     dms_cxt->flush_copy_get_page_failed = false;
 }
+
+static void knl_t_ondemand_xlog_copy_context_init(knl_t_ondemand_xlog_copy_context *ondemand_xlog_copy_cxt)
+{
+    ondemand_xlog_copy_cxt->openLogFile = -1;
+    ondemand_xlog_copy_cxt->openLogSegNo = 0;
+    ondemand_xlog_copy_cxt->openLogOff = 0;
+}
+
 static void knl_t_rc_init(knl_t_rc_context* rc_cxt)
 {
     errno_t rc = EOK;
@@ -1889,6 +1897,7 @@ void knl_thread_init(knl_thread_role role)
     knl_index_advisor_init(&t_thrd.index_advisor_cxt);
     knl_t_sql_patch_init(&t_thrd.sql_patch_cxt);
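The initializer above pins down the shape of the new per-thread state used while copying xlog during on-demand recovery; judging only from the three assignments, the struct presumably looks like this (field names come from the patch, the types are assumptions modeled on the openLogFile/openLogOff globals in xlog.cpp):

    typedef struct knl_t_ondemand_xlog_copy_context {
        int openLogFile;        /* fd of the currently open xlog segment; -1 = none */
        XLogSegNo openLogSegNo; /* segment number of the open file */
        uint32 openLogOff;      /* current write offset within the segment */
    } knl_t_ondemand_xlog_copy_context;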
     knl_t_dms_context_init(&t_thrd.dms_cxt);
+    knl_t_ondemand_xlog_copy_context_init(&t_thrd.ondemand_xlog_copy_cxt);
     KnlTApplyLauncherInit(&t_thrd.applylauncher_cxt);
     KnlTApplyWorkerInit(&t_thrd.applyworker_cxt);
     KnlTPublicationInit(&t_thrd.publication_cxt);
diff --git a/src/gausskernel/storage/access/common/relfilenode_hash.cpp b/src/gausskernel/storage/access/common/relfilenode_hash.cpp
index 3cb0a814ef889c039920184ddfb6d3329670ce3e..093500fe2a1851c114d431ca8af8de35d2e352b3 100644
--- a/src/gausskernel/storage/access/common/relfilenode_hash.cpp
+++ b/src/gausskernel/storage/access/common/relfilenode_hash.cpp
@@ -24,7 +24,6 @@
 #include "storage/buf/buf_internals.h"
 #include "access/cbmparsexlog.h"
 #include "replication/reorderbuffer.h"
-#include "access/extreme_rto/batch_redo.h"
 #include "replication/datareceiver.h"
 #include "pgstat.h"
 #include "storage/smgr/relfilenode_hash.h"
@@ -158,28 +157,6 @@ int CBMPageTagMatch(const void *left, const void *right, Size keysize)
     return 1;
 }
 
-uint32 RedoItemTagHash(const void *key, Size keysize)
-{
-    extreme_rto::RedoItemTag redoItemTag = *(const extreme_rto::RedoItemTag *)key;
-    redoItemTag.rNode.opt = DefaultFileNodeOpt;
-    return DatumGetUInt32(hash_any((const unsigned char *)&redoItemTag, (int)keysize));
-}
-
-int RedoItemTagMatch(const void *left, const void *right, Size keysize)
-{
-    const extreme_rto::RedoItemTag *leftKey = (const extreme_rto::RedoItemTag *)left;
-    const extreme_rto::RedoItemTag *rightKey = (const extreme_rto::RedoItemTag *)right;
-    Assert(keysize == sizeof(extreme_rto::RedoItemTag));
-
-    /* we just care whether the result is 0 or not */
-    if (RelFileNodeEquals(leftKey->rNode, rightKey->rNode) && leftKey->forkNum == rightKey->forkNum &&
-        leftKey->blockNum == rightKey->blockNum) {
-        return 0;
-    }
-
-    return 1;
-}
-
 uint32 DataWriterRelKeyHash(const void *key, Size keysize)
 {
     data_writer_rel_key dataWriterRelKey = *(const data_writer_rel_key *)key;
diff --git a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp
index 01067ecb20c65d3d8dad21a0daec904f2c63af73..0fb813d5b503a3296c0ef27ceb938b245907640d 100644
--- a/src/gausskernel/storage/access/redo/redo_xlogutils.cpp
+++ b/src/gausskernel/storage/access/redo/redo_xlogutils.cpp
@@ -52,7 +52,6 @@
 #include "commands/dbcommands.h"
 #include "access/twophase.h"
 #include "access/redo_common.h"
-#include "access/extreme_rto/page_redo.h"
 #include "ddes/dms/ss_dms_bufmgr.h"
 
 THR_LOCAL RedoParseManager *g_parseManager = NULL;
@@ -274,7 +273,7 @@ XLogRedoAction XLogCheckBlockDataRedoAction(XLogBlockDataParse *datadecode, Redo
     if (needRepair && g_instance.pid_cxt.PageRepairPID != 0) {
         XLogRecPtr pageCurLsn = PageGetLSN(bufferinfo->pageinfo.page);
         UnlockReleaseBuffer(bufferinfo->buf);
-        extreme_rto::RecordBadBlockAndPushToRemote(datadecode, LSN_CHECK_FAIL, pageCurLsn,
+        ExtremeRecordBadBlockAndPushToRemote(datadecode, LSN_CHECK_FAIL, pageCurLsn,
             bufferinfo->blockinfo.pblk);
         bufferinfo->buf = InvalidBuffer;
         bufferinfo->pageinfo = {0};
@@ -914,38 +913,6 @@ void XLogRecSetSegNewPageInfo(XLogBlockSegNewPage *state, char *mainData, Size l
     state->dataLen = len;
 }
 
-
-static inline bool AtomicCompareExchangeBuffer(volatile Buffer *ptr, Buffer *expected, Buffer newval)
-{
-    bool ret = false;
-    Buffer current;
-    current = __sync_val_compare_and_swap(ptr, *expected, newval);
-    ret = (current == *expected);
-    *expected = current;
-    return ret;
-}
-
-static inline Buffer AtomicReadBuffer(volatile Buffer *ptr)
-{
-    return *ptr;
-}
-
-static inline void AtomicWriteBuffer(volatile Buffer* ptr, Buffer val)
-{
-    *ptr = val;
-}
-
-static inline Buffer AtomicExchangeBuffer(volatile Buffer *ptr, Buffer newval)
-{
-    Buffer old;
-    while (true) {
-        old = AtomicReadBuffer(ptr);
-        if (AtomicCompareExchangeBuffer(ptr, &old, newval))
-            break;
-    }
-    return old;
-}
-
 /* add for batch redo mem manager */
 void *XLogMemCtlInit(RedoMemManager *memctl, Size itemsize, int itemnum)
 {
@@ -1164,6 +1131,10 @@ void XLogRedoBufferSetState(RedoBufferManager *buffermanager, RedoMemSlot *buffe
 void XLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate,
     InterruptFunc interruptOperte)
 {
+    if (SS_IN_ONDEMAND_RECOVERY) {
+        return OndemandXLogParseBufferInit(parsemanager, buffernum, refOperate, interruptOperte);
+    }
+
     void *allocdata = NULL;
     allocdata = XLogMemCtlInit(&(parsemanager->memctl), (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)),
         buffernum);
@@ -1178,6 +1149,11 @@ void XLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOpera
 
 void XLogParseBufferDestory(RedoParseManager *parsemanager)
 {
+    if (SS_IN_ONDEMAND_RECOVERY) {
+        OndemandXLogParseBufferDestory(parsemanager);
+        return;
+    }
+
     g_parseManager = NULL;
     if (parsemanager->parsebuffers != NULL) {
         pfree(parsemanager->parsebuffers);
@@ -1189,6 +1165,10 @@ void XLogParseBufferDestory(RedoParseManager *parsemanager)
 XLogRecParseState *XLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead,
     void *record)
 {
+    if (SS_IN_ONDEMAND_RECOVERY) {
+        return OndemandXLogParseBufferAllocList(parsemanager, blkstatehead, record);
+    }
+
     RedoMemManager *memctl = &(parsemanager->memctl);
     RedoMemSlot *allocslot = NULL;
     ParseBufferDesc *descstate = NULL;
@@ -1234,11 +1214,16 @@ XLogRecParseState *XLogParseBufferCopy(XLogRecParseState *srcState)
     securec_check(rc, "\0", "\0");
 
     newState->isFullSync = srcState->isFullSync;
+    newState->distributeStatus = srcState->distributeStatus;
     return newState;
 }
 
 void XLogParseBufferRelease(XLogRecParseState *recordstate)
 {
+    if (SS_IN_ONDEMAND_RECOVERY) {
+        OndemandXLogParseBufferRelease(recordstate);
+        return;
+    }
     RedoMemManager *memctl = &(recordstate->manager->memctl);
     ParseBufferDesc *descstate = NULL;
 
@@ -1692,7 +1677,9 @@ void ExtremeRtoFlushBuffer(RedoBufferInfo *bufferinfo, bool updateFsm)
     } else {
         if (bufferinfo->pageinfo.page != NULL) {
             BufferDesc *bufDesc = GetBufferDescriptor(bufferinfo->buf - 1);
-            if (bufferinfo->dirtyflag || XLByteLT(bufDesc->extra->lsn_on_disk, PageGetLSN(bufferinfo->pageinfo.page))) {
+            /* backends may mark buffer dirty already */
+            if (!(bufDesc->state & BM_DIRTY) &&
+                (bufferinfo->dirtyflag || XLByteLT(bufDesc->extra->lsn_on_disk, PageGetLSN(bufferinfo->pageinfo.page)))) {
                 MarkBufferDirty(bufferinfo->buf);
                 if (!bufferinfo->dirtyflag && bufferinfo->blockinfo.forknum == MAIN_FORKNUM) {
                     int mode = WARNING;
@@ -1700,8 +1687,8 @@ void ExtremeRtoFlushBuffer(RedoBufferInfo *bufferinfo, bool updateFsm)
                     mode = PANIC;
 #endif
                     const uint32 shiftSz = 32;
-                    ereport(mode, (errmsg("extreme_rto not mark dirty:lsn %X/%X, lsn_disk %X/%X, \
-                        lsn_page %X/%X, page %u/%u/%u %u",
+                    ereport(mode, (errmsg("extreme_rto not mark dirty:lsn %X/%X, lsn_disk %X/%X, "
+                        "lsn_page %X/%X, page %u/%u/%u %u",
                         (uint32)(bufferinfo->lsn >> shiftSz), (uint32)(bufferinfo->lsn),
                         (uint32)(bufDesc->extra->lsn_on_disk >> shiftSz), (uint32)(bufDesc->extra->lsn_on_disk),
@@ -1765,10 +1752,12 @@ bool XLogBlockRedoForExtremeRTO(XLogRecParseState *redoblocktate, RedoBufferInfo
         ereport(PANIC,
(errmsg("XLogBlockRedoForExtremeRTO: redobuffer checkfailed"))); } if (block_valid <= BLOCK_DATA_FSM_TYPE) { - GetRedoStartTime(redoCost); - Assert(block_valid == g_xlogExtRtoRedoTable[block_valid].block_valid); - g_xlogExtRtoRedoTable[block_valid].xlog_redoextrto(blockhead, blockrecbody, bufferinfo); - CountRedoTime(redoCost); + if (redoaction != BLK_DONE) { + GetRedoStartTime(redoCost); + Assert(block_valid == g_xlogExtRtoRedoTable[block_valid].block_valid); + g_xlogExtRtoRedoTable[block_valid].xlog_redoextrto(blockhead, blockrecbody, bufferinfo); + CountRedoTime(redoCost); + } #ifdef USE_ASSERT_CHECKING if (block_valid != BLOCK_DATA_UNDO_TYPE && !bufferinfo->pageinfo.ignorecheck) { DoRecordCheck(redoblocktate, PageGetLSN(bufferinfo->pageinfo.page), true); @@ -1782,6 +1771,33 @@ bool XLogBlockRedoForExtremeRTO(XLogRecParseState *redoblocktate, RedoBufferInfo return false; } +void XlogBlockRedoForOndemandExtremeRTOQuery(XLogRecParseState *redoBlockState, RedoBufferInfo *bufferInfo) +{ + + XLogBlockHead *blockHead = &redoBlockState->blockparse.blockhead; + void *blockrecBody = &redoBlockState->blockparse.extra_rec; + uint16 blockValid = XLogBlockHeadGetValidInfo(blockHead); + + bool checkValid = XLogBlockRefreshRedoBufferInfo(blockHead, bufferInfo); + if (!checkValid) { + ereport(PANIC, (errmsg("XLogBlockRedoForOndemandExtremeRTOQuery: redobuffer checkfailed"))); + } + if (blockValid <= BLOCK_DATA_FSM_TYPE) { + Assert(blockValid == g_xlogExtRtoRedoTable[blockValid].block_valid); + g_xlogExtRtoRedoTable[blockValid].xlog_redoextrto(blockHead, blockrecBody, bufferInfo); +#ifdef USE_ASSERT_CHECKING + if (blockValid != BLOCK_DATA_UNDO_TYPE) { + DoRecordCheck(redoBlockState, PageGetLSN(bufferInfo->pageinfo.page), true); + } +#endif + } else { + ereport(WARNING, (errmsg("XLogBlockRedoForOndemandExtremeRTOQuery: unsuport type %u, lsn %X/%X", + (uint32)blockValid, + (uint32)(blockHead->end_ptr >> 32), + (uint32)(blockHead->end_ptr)))); + } +} + static const XLogParseBlock g_xlogParseBlockTable[RM_MAX_ID + 1] = { { xlog_redo_parse_to_block, RM_XLOG_ID }, { xact_redo_parse_to_block, RM_XACT_ID }, diff --git a/src/gausskernel/storage/access/transam/CMakeLists.txt b/src/gausskernel/storage/access/transam/CMakeLists.txt index 8c9732a85b0966d622dfa883e8aa14b7821292be..7cda972dae3f0cc73f6706150f88d4436e64d2a4 100755 --- a/src/gausskernel/storage/access/transam/CMakeLists.txt +++ b/src/gausskernel/storage/access/transam/CMakeLists.txt @@ -5,6 +5,7 @@ list(APPEND TGT_transam_SRC ${CMAKE_CURRENT_SOURCE_DIR}/clog.cpp ${CMAKE_CURRENT_SOURCE_DIR}/csnlog.cpp ${CMAKE_CURRENT_SOURCE_DIR}/double_write.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/extreme_rto_redo_api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/single_double_write.cpp ${CMAKE_CURRENT_SOURCE_DIR}/multi_redo_api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/multi_redo_settings.cpp @@ -51,10 +52,12 @@ add_static_objtarget(gausskernel_storage_access_transam TGT_transam_SRC TGT_tran set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/parallel_recovery ${CMAKE_CURRENT_SOURCE_DIR}/extreme_rto + ${CMAKE_CURRENT_SOURCE_DIR}/ondemand_extreme_rto ) add_subdirectory(parallel_recovery) add_subdirectory(extreme_rto) +add_subdirectory(ondemand_extreme_rto) install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/recovery.conf.sample DESTINATION share/postgresql/ diff --git a/src/gausskernel/storage/access/transam/Makefile b/src/gausskernel/storage/access/transam/Makefile index 7e86d13d153b4f3db03ee0f0716728089a8375df..351cb189c825b3c7228ff427d636e44ef7893753 100644 --- a/src/gausskernel/storage/access/transam/Makefile 
+++ b/src/gausskernel/storage/access/transam/Makefile @@ -12,16 +12,16 @@ endif ifeq ($(enable_multiple_nodes), yes) OBJS = clog.o multixact.o rmgr.o slru.o csnlog.o transam.o twophase.o \ twophase_rmgr.o varsup.o double_write.o single_double_write.o seg_double_write.o redo_statistic.o multi_redo_api.o multi_redo_settings.o\ - xact.o xlog.o xlogfuncs.o \ + xact.o xlog.o xlogfuncs.o extreme_rto_redo_api.o \ xloginsert.o xlogreader.o xlogutils.o cbmparsexlog.o cbmfuncs.o else OBJS = clog.o gtm_single.o multixact.o rmgr.o slru.o csnlog.o transam.o twophase.o \ twophase_rmgr.o varsup.o double_write.o single_double_write.o seg_double_write.o redo_statistic.o multi_redo_api.o multi_redo_settings.o\ - xact.o xlog.o xlogfuncs.o \ + xact.o xlog.o xlogfuncs.o extreme_rto_redo_api.o \ xloginsert.o xlogreader.o xlogutils.o cbmparsexlog.o cbmfuncs.o endif -SUBDIRS = parallel_recovery extreme_rto +SUBDIRS = parallel_recovery extreme_rto ondemand_extreme_rto include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt b/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt index 4350a807c3b446e14d64a1ab190e4fd2ac7171e3..ccb87469aaf370293e8ebaf3e029eb8ec919b699 100755 --- a/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt +++ b/src/gausskernel/storage/access/transam/extreme_rto/CMakeLists.txt @@ -10,7 +10,8 @@ set(TGT_extreme_rto_INC ${LIBCGROUP_INCLUDE_PATH} ${PROJECT_SRC_DIR}/include/libcomm ${ZLIB_INCLUDE_PATH} - ${LIBCURL_INCLUDE_PATH} + ${LIBCURL_INCLUDE_PATH} + ${DCF_INCLUDE_PATH} ) set(extreme_rto_DEF_OPTIONS ${MACRO_OPTIONS}) diff --git a/src/gausskernel/storage/access/transam/extreme_rto/Makefile b/src/gausskernel/storage/access/transam/extreme_rto/Makefile index 46f0518138900ee75f14811ce49697659728f6e1..6b4b4968ec027b9c3970557afeab2372680004bf 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto/Makefile +++ b/src/gausskernel/storage/access/transam/extreme_rto/Makefile @@ -26,6 +26,6 @@ top_builddir = ../../../../../.. 
include $(top_builddir)/src/Makefile.global OBJS = dispatcher.o page_redo.o posix_semaphore.o redo_item.o \ - spsc_blocking_queue.o txn_redo.o batch_redo.o + spsc_blocking_queue.o txn_redo.o batch_redo.o xlog_read.o include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp b/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp index 92b94fedca418b31755d4b863b1ad872fdc30168..e45dff0b6fc2bcc4fbc4c1e6c68d1aa18477beb1 100644 --- a/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/batch_redo.cpp @@ -71,6 +71,28 @@ void PRInitRedoItemEntry(RedoItemHashEntry *redoItemHashEntry) redoItemHashEntry->tail = NULL; } +uint32 RedoItemTagHash(const void *key, Size keysize) +{ + RedoItemTag redoItemTag = *(const RedoItemTag *)key; + redoItemTag.rNode.opt = DefaultFileNodeOpt; + return DatumGetUInt32(hash_any((const unsigned char *)&redoItemTag, (int)keysize)); +} + +int RedoItemTagMatch(const void *left, const void *right, Size keysize) +{ + const RedoItemTag *leftKey = (const RedoItemTag *)left; + const RedoItemTag *rightKey = (const RedoItemTag *)right; + Assert(keysize == sizeof(RedoItemTag)); + + /* we just care whether the result is 0 or not */ + if (RelFileNodeEquals(leftKey->rNode, rightKey->rNode) && leftKey->forkNum == rightKey->forkNum && + leftKey->blockNum == rightKey->blockNum) { + return 0; + } + + return 1; +} + HTAB *PRRedoItemHashInitialize(MemoryContext context) { HASHCTL ctl; diff --git a/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp b/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp index 2839b2ce78598d319a853d4c489b59a3ba199ad1..5f458c148599dfe6a2aacdb8f1b0b64266c691f5 100755 --- a/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/dispatcher.cpp @@ -100,7 +100,6 @@ static const int32 MAX_PENDING_STANDBY = 1; static const int32 ITEM_QUQUE_SIZE_RATIO = 5; static const uint32 EXIT_WAIT_DELAY = 100; /* 100 us */ -uint32 g_startupTriggerState = TRIGGER_NORMAL; uint32 g_readManagerTriggerFlag = TRIGGER_NORMAL; static const int invalid_worker_id = -1; @@ -398,7 +397,7 @@ void HandleStartupInterruptsForExtremeRto() Assert(AmStartupProcess()); uint32 newtriggered = (uint32)CheckForSatartupStatus(); - if (newtriggered != extreme_rto::TRIGGER_NORMAL) { + if (newtriggered != TRIGGER_NORMAL) { uint32 triggeredstate = pg_atomic_read_u32(&(g_startupTriggerState)); if (triggeredstate != newtriggered) { ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), @@ -2096,7 +2095,7 @@ RedoWaitInfo redo_get_io_event(int32 event_id) return resultInfo; } -void redo_get_wroker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) +void redo_get_worker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) { PageRedoWorker *redoWorker = NULL; SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); @@ -2134,7 +2133,7 @@ void make_worker_static_info(RedoWorkerTimeCountsInfo *workerCountInfo, PageRedo workerCountInfo->time_cost = redoWorker->timeCostList; } -void redo_get_wroker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) +void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) { SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock)); knl_parallel_redo_state state = g_instance.comm_cxt.predo_cxt.state; diff --git 
a/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp b/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp index 1262eb4c66388208c6c8a11231eca055ee870ad2..e2b656aaa1bbbc2b11d4d7507f3476ad6a95a1aa 100755 --- a/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp +++ b/src/gausskernel/storage/access/transam/extreme_rto/page_redo.cpp @@ -64,6 +64,7 @@ #include "access/extreme_rto/page_redo.h" #include "access/extreme_rto/dispatcher.h" #include "access/extreme_rto/txn_redo.h" +#include "access/extreme_rto/xlog_read.h" #include "pgstat.h" #include "access/extreme_rto/batch_redo.h" #include "access/multi_redo_api.h" @@ -2195,12 +2196,12 @@ static void HandleExtremeRtoCascadeStandbyPromote(uint32 trigger) pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone, 1); WakeupRecovery(); XLogReadManagerResponseSignal(trigger); - pg_atomic_write_u32(&(extreme_rto::g_startupTriggerState), TRIGGER_NORMAL); + pg_atomic_write_u32(&g_startupTriggerState, TRIGGER_NORMAL); } bool XLogReadManagerCheckSignal() { - uint32 trigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState)); + uint32 trigger = pg_atomic_read_u32(&g_startupTriggerState); load_server_mode(); if (g_dispatcher->smartShutdown || trigger == TRIGGER_PRIMARY || trigger == TRIGGER_SWITCHOVER || (trigger == TRIGGER_FAILOVER && t_thrd.xlog_cxt.server_mode == STANDBY_MODE) || diff --git a/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp b/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e6e4b4da96dcb329496a60c2b1529d52f266b2c3 --- /dev/null +++ b/src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp @@ -0,0 +1,1008 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * ------------------------------------------------------------------------- + * + * xlog_read.cpp + * + * IDENTIFICATION + * src/gausskernel/storage/access/transam/extreme_rto/xlog_read.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "access/extreme_rto/spsc_blocking_queue.h" +#include "access/extreme_rto/dispatcher.h" +#include "access/multi_redo_api.h" +#include "access/xlog.h" +#include "ddes/dms/ss_reform_common.h" +#include "replication/walreceiver.h" +#include "replication/dcf_replication.h" +#include "replication/shared_storage_walreceiver.h" +#include "storage/ipc.h" + +namespace extreme_rto { +static bool DoEarlyExit() +{ + if (g_dispatcher == NULL) { + return false; + } + return g_dispatcher->recoveryStop; +} + +inline static XLogReaderState *ReadNextRecordFromQueue(int emode) +{ + char *errormsg = NULL; + SPSCBlockingQueue *linequeue = g_dispatcher->readLine.readPageThd->queue; + XLogReaderState *xlogreader = NULL; + do { + xlogreader = (XLogReaderState *)SPSCBlockingQueueTake(linequeue); + if (!xlogreader->isDecode) { + XLogRecord *record = (XLogRecord *)xlogreader->readRecordBuf; + GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_5]); + if (!DecodeXLogRecord(xlogreader, record, &errormsg)) { + ereport(emode, + (errmsg("ReadNextRecordFromQueue %X/%X decode error, %s", (uint32)(xlogreader->EndRecPtr >> 32), + (uint32)(xlogreader->EndRecPtr), errormsg))); + + RedoItem *item = GetRedoItemPtr(xlogreader); + + FreeRedoItem(item); + + xlogreader = NULL; + } + CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_5]); + } + + if ((void *)xlogreader == (void *)&(g_GlobalLsnForwarder.record) || + (void *)xlogreader == (void *)&(g_cleanupMark.record)) { + StartupSendFowarder(GetRedoItemPtr(xlogreader)); + xlogreader = NULL; + } + + RedoInterruptCallBack(); + } while (xlogreader == NULL); + + return xlogreader; +} + +XLogRecord *ReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode) +{ + XLogRecord *record = NULL; + XLogReaderState *xlogreader = ReadNextRecordFromQueue(emode); + + if ((void *)xlogreader != (void *)&(g_redoEndMark.record)) { + *xlogreaderptr = xlogreader; + t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr; + t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr; + record = (XLogRecord *)xlogreader->readRecordBuf; + } else { + *xlogreaderptr = &g_redoEndMark.record; + if (t_thrd.startup_cxt.shutdown_requested) { + proc_exit(0); + } + } + return record; +} + +void SwitchToReadXlogFromFile(XLogRecPtr pageptr) +{ + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readSource, XLOG_FROM_PG_XLOG); + pg_atomic_write_u64(&g_dispatcher->rtoXlogBufState.expectLsn, InvalidXLogRecPtr); + pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_STOPPING); + uint32 workerState = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + while (workerState != WORKER_STATE_EXIT && workerState != WORKER_STATE_STOP) { + RedoInterruptCallBack(); + workerState = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + } +} + +bool HasReceivedTrigger() +{ + uint32 trigger = pg_atomic_read_u32(&g_readManagerTriggerFlag); + if (trigger > 0) { + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readSource, XLOG_FROM_PG_XLOG); + pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_STOPPING); + return true; + } + return false; +} + +/* receivedUpto indicates that new WAL has been received, but it may not be readable yet; check the receiving status first */ +bool IsReceivingStatusOk() +{ + WalRcvCtlBlock *walrcb = getCurrentWalRcvCtlBlock(); +
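/* when the read worker has already stopped and the walreceiver control block is
+     * gone, no more WAL can arrive in the buffer, so callers fall back to file reads */
+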
uint32 startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + if (startreadworker == WORKER_STATE_STOP && walrcb == NULL) { + return false; + } + return true; +} + +inline XLogRecPtr CalcExpectLsn(XLogRecPtr recPtr) +{ + XLogRecPtr expectedRecPtr = recPtr; + if (recPtr % XLogSegSize == 0) { + XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD); + } else if (recPtr % XLOG_BLCKSZ == 0) { + XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD); + } + return expectedRecPtr; +} + +int ParallelXLogReadWorkBufRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, TimeLineID *readTLI) +{ + XLogRecPtr RecPtr = targetPagePtr; + uint32 targetPageOff = targetPagePtr % XLogSegSize; + + XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo); + XLByteAdvance(RecPtr, reqLen); + + XLogRecPtr expectedRecPtr = CalcExpectLsn(RecPtr); + uint64 waitXLogCount = 0; + const uint64 pushLsnCount = 2; + + pg_atomic_write_u64(&g_dispatcher->rtoXlogBufState.expectLsn, expectedRecPtr); + for (;;) { + // Check to see if the trigger file exists. If so, update the gaussdb state file. + if (CheckForStandbyTrigger() +#ifndef ENABLE_MULTIPLE_NODES + && IsDCFReadyOrDisabled() +#endif + ) { + SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL); + } + + /* + * If we find an invalid record in the WAL streamed from + * master, something is seriously wrong. There's little + * chance that the problem will just go away, but PANIC is + * not good for availability either, especially in hot + * standby mode. Disconnect, and retry from + * archive/pg_xlog again. The WAL in the archive should be + * identical to what was streamed, so it's unlikely that + * it helps, but one can hope... + */ + if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) { + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.failSource, XLOG_FROM_STREAM); + SwitchToReadXlogFromFile(targetPagePtr); + return -1; + } + + ResetRtoXlogReadBuf(targetPagePtr); + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * alloted to conflicting queries will decrease. + */ + bool havedata = NewDataIsInBuf(expectedRecPtr); + if (havedata) { + /* just make sure source info is correct... 
*/ + t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; + t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; + waitXLogCount = 0; + if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) { + t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ; + } else { + t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff; + } + + /* read from wal writer buffer */ + bool readflag = XLogPageReadForExtRto(xlogreader, targetPagePtr, t_thrd.xlog_cxt.readLen); + if (readflag) { + *readTLI = t_thrd.xlog_cxt.curFileTLI; + return t_thrd.xlog_cxt.readLen; + } else { + if (!IsReceivingStatusOk()) { + SwitchToReadXlogFromFile(targetPagePtr); + return -1; + } + } + } else { + if (HasReceivedTrigger()) { + return -1; + } + + uint32 waitRedoDone = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone); + if (waitRedoDone == 1 || DoEarlyExit()) { + SwitchToReadXlogFromFile(targetPagePtr); + return -1; + } + /* + * Wait for more WAL to arrive, or timeout to be reached + */ + WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L); + ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch); + PushToWorkerLsn(waitXLogCount == pushLsnCount); + ++waitXLogCount; + } + + RedoInterruptCallBack(); + } + + return -1; +} + +void WaitReplayFinishAfterReadXlogFileComplete(XLogRecPtr lastValidRecordLsn) +{ + Assert(t_thrd.xlog_cxt.EndRecPtr == lastValidRecordLsn); + XLogRecPtr lastReplayedLsn = GetXLogReplayRecPtr(NULL); + + while (XLByteLT(lastReplayedLsn, lastValidRecordLsn) && !DoEarlyExit()) { + RedoInterruptCallBack(); + const long sleepTime = 100; + pg_usleep(sleepTime); + lastReplayedLsn = GetXLogReplayRecPtr(NULL); + } +} + +int ParallelXLogPageReadFile(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, + TimeLineID *readTLI) +{ + bool randAccess = false; + uint32 targetPageOff; + volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; + XLogRecPtr RecPtr = targetPagePtr; + uint32 ret; +#ifdef USE_ASSERT_CHECKING + XLogSegNo targetSegNo; + + XLByteToSeg(targetPagePtr, targetSegNo); +#endif + targetPageOff = targetPagePtr % XLogSegSize; + + /* + * See if we need to switch to a new segment because the requested record + * is not in the currently open one. + */ + if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) { + close(t_thrd.xlog_cxt.readFile); + t_thrd.xlog_cxt.readFile = -1; + t_thrd.xlog_cxt.readSource = 0; + } + + XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo); + XLByteAdvance(RecPtr, reqLen); + +retry: + /* See if we need to retrieve more data */ + if (t_thrd.xlog_cxt.readFile < 0) { + if (t_thrd.xlog_cxt.StandbyMode) { + /* + * In standby mode, wait for the requested record to become + * available, either via restore_command succeeding to restore the + * segment, or via walreceiver having streamed the record. + */ + for (;;) { + RedoInterruptCallBack(); + if (t_thrd.xlog_cxt.readFile >= 0) { + close(t_thrd.xlog_cxt.readFile); + t_thrd.xlog_cxt.readFile = -1; + } + /* Reset curFileTLI if random fetch. */ + if (randAccess) { + t_thrd.xlog_cxt.curFileTLI = 0; + } + + /* + * Try to restore the file from archive, or read an + * existing file from pg_xlog. + */ + uint32 sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG; + if (!(sources & ~t_thrd.xlog_cxt.failedSources)) { + /* + * We've exhausted all options for retrieving the + * file. Retry. 
+ */ + t_thrd.xlog_cxt.failedSources = 0; + + /* + * Before we sleep, re-scan for possible new timelines + * if we were requested to recover to the latest + * timeline. + */ + if (t_thrd.xlog_cxt.recoveryTargetIsLatest) { + if (rescanLatestTimeLine()) { + continue; + } + } + + PushToWorkerLsn(true); + WaitReplayFinishAfterReadXlogFileComplete(t_thrd.xlog_cxt.EndRecPtr); + + if (!xlogctl->IsRecoveryDone) { + g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp(); + g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr; + } + + XLogRecPtr lastReplayedLsn = GetXLogReplayRecPtr(NULL); + ereport(LOG, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("ParallelXLogPageReadFile IsRecoveryDone is %s set true," + "ReadRecPtr:%X/%X, EndRecPtr:%X/%X, lastreplayed:%X/%X", + xlogctl->IsRecoveryDone ? "next" : "first", + (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)(t_thrd.xlog_cxt.ReadRecPtr), + (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), (uint32)(t_thrd.xlog_cxt.EndRecPtr), + (uint32)(lastReplayedLsn >> 32), (uint32)(lastReplayedLsn)))); + + /* + * signal postmaster to update local redo end + * point to gaussdb state file. + */ + if (!xlogctl->IsRecoveryDone) { + SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE); + } + + SpinLockAcquire(&xlogctl->info_lck); + xlogctl->IsRecoveryDone = true; + SpinLockRelease(&xlogctl->info_lck); + if (!(IS_SHARED_STORAGE_MODE) || + pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage)) { + knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM); + ereport(LOG, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("ParallelXLogPageReadFile set redo finish status," + "ReadRecPtr:%X/%X, EndRecPtr:%X/%X", + (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), + (uint32)(t_thrd.xlog_cxt.ReadRecPtr), (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), + (uint32)(t_thrd.xlog_cxt.EndRecPtr)))); + + /* + * If it hasn't been long since last attempt, sleep 1s to + * avoid busy-waiting. + */ + pg_usleep(150000L); + } + /* + * If primary_conninfo is set, launch walreceiver to + * try to stream the missing WAL, before retrying to + * restore from archive/pg_xlog. + * + * If fetching_ckpt is TRUE, RecPtr points to the + * initial checkpoint location. In that case, we use + * RedoStartLSN as the streaming start position + * instead of RecPtr, so that when we later jump + * backwards to start redo at RedoStartLSN, we will + * have the logs streamed already. 
+ */ + + uint32 trigger = pg_atomic_read_u32(&g_readManagerTriggerFlag); + if (trigger > 0) { + pg_atomic_write_u32(&g_readManagerTriggerFlag, TRIGGER_NORMAL); + goto triggered; + } + + load_server_mode(); + if (t_thrd.xlog_cxt.PrimaryConnInfo || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) { + t_thrd.xlog_cxt.receivedUpto = 0; + uint32 failSource = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.failSource); + + if (!(failSource & XLOG_FROM_STREAM)) { + volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv; + SpinLockAcquire(&walrcv->mutex); + walrcv->receivedUpto = 0; + SpinLockRelease(&walrcv->mutex); + t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; + t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readSource, + XLOG_FROM_STREAM); + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone, 0); + return -1; + } + } + } + /* Don't try to read from a source that just failed */ + sources &= ~t_thrd.xlog_cxt.failedSources; + t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, DEBUG2, sources); + if (t_thrd.xlog_cxt.readFile >= 0) { + break; + } + /* + * Nope, not found in archive and/or pg_xlog. + */ + t_thrd.xlog_cxt.failedSources |= sources; + + /* + * Check to see if the trigger file exists. Note that we + * do this only after failure, so when you create the + * trigger file, we still finish replaying as much as we + * can from archive and pg_xlog before failover. + */ + uint32 trigger = pg_atomic_read_u32(&g_readManagerTriggerFlag); + if (trigger > 0) { + pg_atomic_write_u32(&g_readManagerTriggerFlag, TRIGGER_NORMAL); + goto triggered; + } + } + } else { + /* In archive or crash recovery. */ + if (t_thrd.xlog_cxt.readFile < 0) { + uint32 sources; + + /* Reset curFileTLI if random fetch. */ + if (randAccess) { + t_thrd.xlog_cxt.curFileTLI = 0; + } + + sources = XLOG_FROM_PG_XLOG; + if (t_thrd.xlog_cxt.InArchiveRecovery) { + sources |= XLOG_FROM_ARCHIVE; + } + + t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, LOG, sources); + + if (t_thrd.xlog_cxt.readFile < 0) { + return -1; + } + } + } + } + + /* + * At this point, we have the right segment open and if we're streaming we + * know the requested record is in it. + */ + Assert(t_thrd.xlog_cxt.readFile != -1); + + /* + * If the current segment is being streamed from master, calculate how + * much of the current page we have received already. We know the + * requested record has been received, but this is for the benefit of + * future calls, to allow quick exit at the top of this function.
+ */ + t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ; + + /* Read the requested page */ + t_thrd.xlog_cxt.readOff = targetPageOff; + +try_again: + if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { + ereport(emode_for_corrupt_record(LOG, RecPtr), + (errcode_for_file_access(), + errmsg("could not seek in log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; + } + goto next_record_is_invalid; + } + pgstat_report_waitevent(WAIT_EVENT_WAL_READ); + ret = read(t_thrd.xlog_cxt.readFile, xlogreader->readBuf, XLOG_BLCKSZ); + pgstat_report_waitevent(WAIT_EVENT_END); + if (ret != XLOG_BLCKSZ) { + ereport(emode_for_corrupt_record(LOG, RecPtr), + (errcode_for_file_access(), + errmsg("could not read from log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; + } + goto next_record_is_invalid; + } + Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo); + Assert(targetPageOff == t_thrd.xlog_cxt.readOff); + Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen); + + *readTLI = t_thrd.xlog_cxt.curFileTLI; + + return t_thrd.xlog_cxt.readLen; + +next_record_is_invalid: + t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource; + + if (t_thrd.xlog_cxt.readFile >= 0) { + close(t_thrd.xlog_cxt.readFile); + } + t_thrd.xlog_cxt.readFile = -1; + t_thrd.xlog_cxt.readLen = 0; + t_thrd.xlog_cxt.readSource = 0; + + /* In standby-mode, keep trying */ + if (t_thrd.xlog_cxt.StandbyMode) { + goto retry; + } else { + return -1; + } + +triggered: + if (t_thrd.xlog_cxt.readFile >= 0) { + close(t_thrd.xlog_cxt.readFile); + } + t_thrd.xlog_cxt.readFile = -1; + t_thrd.xlog_cxt.readLen = 0; + t_thrd.xlog_cxt.readSource = 0; + t_thrd.xlog_cxt.recoveryTriggered = true; + + return -1; +} + +int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, + TimeLineID *readTLI) +{ + int readLen = -1; + pg_atomic_write_u64(&g_dispatcher->rtoXlogBufState.targetRecPtr, targetRecPtr); + xlogreader->readBuf = g_dispatcher->rtoXlogBufState.readBuf; + + for (;;) { + uint32 readSource = pg_atomic_read_u32(&(g_recordbuffer->readSource)); + if (readSource & XLOG_FROM_STREAM) { + readLen = ParallelXLogReadWorkBufRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); + } else { + if (ENABLE_DMS && ENABLE_DSS) { + readLen = SSXLogPageRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, + xlogreader->readBuf, readTLI, NULL); + } else { + readLen = ParallelXLogPageReadFile(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); + } + } + + if (readLen > 0 || t_thrd.xlog_cxt.recoveryTriggered || !t_thrd.xlog_cxt.StandbyMode || DoEarlyExit()) { + return readLen; + } + + RedoInterruptCallBack(); + ADD_ABNORMAL_POSITION(10); + } + + return readLen; +} + +int ParallelReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) +{ + int readLen; + uint32 targetPageOff; + XLogSegNo targetSegNo; + XLogPageHeader hdr; + + Assert((pageptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(pageptr, targetSegNo); + targetPageOff = (pageptr % XLogSegSize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->readSegNo && targetPageOff == state->readOff && reqLen < (int)state->readLen) { + return state->readLen; + } + + /* + * First, 
read the requested data length, but at least a short page header + * so that we can validate it. + */ + readLen = ParallelXLogPageRead(state, pageptr, Max(reqLen, (int)SizeOfXLogShortPHD), state->currRecPtr, + &state->readPageTLI); + if (readLen < 0) { + goto err; + } + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= (int)SizeOfXLogShortPHD) { + goto err; + } + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader)state->readBuf; + + /* still not enough */ + if (readLen < (int)XLogPageHeaderSize(hdr)) { + readLen = ParallelXLogPageRead(state, pageptr, XLogPageHeaderSize(hdr), state->currRecPtr, &state->readPageTLI); + if (readLen < 0) { + goto err; + } + } + + /* + * Now that we know we have the full header, validate it. + */ + if (!ValidXLogPageHeader(state, pageptr, hdr)) { + goto err; + } + + /* update read state information */ + state->readSegNo = targetSegNo; + state->readOff = targetPageOff; + state->readLen = readLen; + + return readLen; + +err: + XLogReaderInvalReadState(state); + return -1; +} + +XLogRecord *ParallelReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) +{ + XLogRecord *record = NULL; + XLogRecPtr targetPagePtr; + bool randAccess = false; + uint32 len, total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool gotheader = false; + int readOff; + errno_t errorno = EOK; + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; + + /* reset error state */ + *errormsg = NULL; + state->errormsg_buf[0] = '\0'; + + if (XLByteEQ(RecPtr, InvalidXLogRecPtr)) { + /* No explicit start point; read the record after the one we just read */ + RecPtr = state->EndRecPtr; + + if (XLByteEQ(state->ReadRecPtr, InvalidXLogRecPtr)) + randAccess = true; + + /* + * If at page start, we must skip over the page header using xrecoff check. + */ + if (0 == RecPtr % XLogSegSize) { + XLByteAdvance(RecPtr, SizeOfXLogLongPHD); + } else if (0 == RecPtr % XLOG_BLCKSZ) { + XLByteAdvance(RecPtr, SizeOfXLogShortPHD); + } + } else { + /* + * Caller supplied a position to start at. + * + * In this case, the passed-in record pointer should already be + * pointing to a valid record starting position. + */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + + state->currRecPtr = RecPtr; + + targetPagePtr = RecPtr - RecPtr % XLOG_BLCKSZ; + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request + * enough byte to cover the whole record header, or at least the part of + * it that fits on the same page. + */ + readOff = ParallelReadPageInternal(state, targetPagePtr, Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); + if (readOff < 0) { + report_invalid_record(state, "read xlog page failed at %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader)state->readBuf); + if (targetRecOff == 0) { + /* + * At page start, so skip over page header. 
+ */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } else if (targetRecOff < pageHeaderSize) { + report_invalid_record(state, "invalid record offset at %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + if ((((XLogPageHeader)state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { + report_invalid_record(state, "contrecord is requested by %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert((int)pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. + */ + record = (XLogRecord *)(state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = record->xl_tot_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) { + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) + goto err; + gotheader = true; + } else { + /* more validation should be done here */ + if (total_len < SizeOfXLogRecord || total_len >= XLogRecordMaxSize) { + report_invalid_record(state, "invalid record length at %X/%X: wanted %u, got %u", (uint32)(RecPtr >> 32), + (uint32)RecPtr, (uint32)(SizeOfXLogRecord), + total_len); + goto err; + } + gotheader = false; + } + + /* + * Enlarge readRecordBuf as needed. + */ + if (total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)) { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", total_len, (uint32)(RecPtr >> 32), + (uint32)RecPtr); + goto err; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) { + /* Need to reassemble record */ + char *contdata = NULL; + XLogPageHeader pageHeader; + char *buffer = NULL; + uint32 gotlen; + errno_t errorno = EOK; + + readOff = ParallelReadPageInternal(state, targetPagePtr, XLOG_BLCKSZ); + if (readOff < 0) { + goto err; + } + + /* Copy the first fragment of the record from the first page. 
*/ + errorno = memcpy_s(state->readRecordBuf, len, state->readBuf + RecPtr % XLOG_BLCKSZ, len); + securec_check_c(errorno, "\0", "\0"); + buffer = state->readRecordBuf + len; + gotlen = len; + + do { + /* Calculate pointer to beginning of next page */ + XLByteAdvance(targetPagePtr, XLOG_BLCKSZ); + + /* Wait for the next page to become available */ + readOff = ParallelReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + Assert((int)SizeOfXLogShortPHD <= readOff); + + /* Check that the continuation on next page looks valid */ + pageHeader = (XLogPageHeader)state->readBuf; + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { + report_invalid_record(state, "there is no contrecord flag at %X/%X", (uint32)(RecPtr >> 32), + (uint32)RecPtr); + goto err; + } + + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. + */ + if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { + report_invalid_record(state, "invalid contrecord length %u at %X/%X", pageHeader->xlp_rem_len, + (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + if (readOff < (int)pageHeaderSize) + readOff = ParallelReadPageInternal(state, targetPagePtr, pageHeaderSize); + + Assert((int)pageHeaderSize <= readOff); + + contdata = (char *)state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < (int)(pageHeaderSize + len)) + readOff = ParallelReadPageInternal(state, targetPagePtr, pageHeaderSize + len); + + errorno = memcpy_s(buffer, total_len - gotlen, (char *)contdata, len); + securec_check_c(errorno, "", ""); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. */ + if (!gotheader) { + record = (XLogRecord *)state->readRecordBuf; + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) + goto err; + gotheader = true; + } + } while (gotlen < total_len); + + Assert(gotheader); + + record = (XLogRecord *)state->readRecordBuf; + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; + + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader)state->readBuf); + state->ReadRecPtr = RecPtr; + state->EndRecPtr = targetPagePtr; + XLByteAdvance(state->EndRecPtr, (pageHeaderSize + MAXALIGN(pageHeader->xlp_rem_len))); + } else { + /* Wait for the record data to become available */ + readOff = ParallelReadPageInternal(state, targetPagePtr, Min(targetRecOff + total_len, XLOG_BLCKSZ)); + if (readOff < 0) { + goto err; + } + + /* Record does not cross a page boundary */ + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; + + state->EndRecPtr = RecPtr; + XLByteAdvance(state->EndRecPtr, MAXALIGN(total_len)); + + state->ReadRecPtr = RecPtr; + errorno = memcpy_s(state->readRecordBuf, total_len, record, total_len); + securec_check_c(errorno, "\0", "\0"); + record = (XLogRecord *)state->readRecordBuf; + } + + /* + * Special processing if it's an XLOG SWITCH record + */ + if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) { + /* Pretend it extends to end of segment */ + state->EndRecPtr += XLogSegSize - 1; + state->EndRecPtr -= state->EndRecPtr % XLogSegSize; + } + + return record; +err: + + /* + * Invalidate the read state. We might read from a different source after + * failure. 
+ */ + XLogReaderInvalReadState(state); + + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + + return NULL; +} + +XLogRecord *XLogParallelReadNextRecord(XLogReaderState *xlogreader) +{ + XLogRecord *record = NULL; + + /* This is the first try to read this page. */ + t_thrd.xlog_cxt.failedSources = 0; + for (;;) { + char *errormsg = NULL; + + record = ParallelReadRecord(xlogreader, InvalidXLogRecPtr, &errormsg); + t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr; + t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr; + g_instance.comm_cxt.predo_cxt.redoPf.read_ptr = t_thrd.xlog_cxt.ReadRecPtr; + + if (record == NULL) { + /* + * We only end up here without a message when XLogPageRead() failed + * - in that case we already logged something. + * In StandbyMode that only happens if we have been triggered, so + * we shouldn't loop anymore in that case. + */ + if (errormsg != NULL) + ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr), + (errmsg_internal("%s", errormsg) /* already translated */)); + } + + /* + * Check page TLI is one of the expected values. + */ + else if ((!timeLineInHistory(xlogreader->latestPageTLI, t_thrd.xlog_cxt.expectedTLIs)) && + (!(g_instance.attr.attr_storage.IsRoachStandbyCluster && dummyStandbyMode))) { + char fname[MAXFNAMELEN]; + XLogSegNo targetSegNo; + int32 offset; + errno_t errorno = EOK; + + XLByteToSeg(xlogreader->latestPagePtr, targetSegNo); + offset = xlogreader->latestPagePtr % XLogSegSize; + + errorno = snprintf_s(fname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", xlogreader->readPageTLI, + (uint32)((targetSegNo) / XLogSegmentsPerXLogId), + (uint32)((targetSegNo) % XLogSegmentsPerXLogId)); + securec_check_ss(errorno, "", ""); + + ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr), + (errmsg("unexpected timeline ID %u in log segment %s, offset %u", xlogreader->latestPageTLI, fname, + offset))); + record = NULL; + } + + if (record != NULL) { + /* Set up lastest valid record */ + latestValidRecord = t_thrd.xlog_cxt.ReadRecPtr; + latestRecordCrc = record->xl_crc; + latestRecordLen = record->xl_tot_len; + ADD_ABNORMAL_POSITION(9); + /* Great, got a record */ + return record; + } else { + /* No valid record available from this source */ + t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource; + + if (t_thrd.xlog_cxt.readFile >= 0) { + close(t_thrd.xlog_cxt.readFile); + t_thrd.xlog_cxt.readFile = -1; + } + + /* + * If archive recovery was requested, but we were still doing + * crash recovery, switch to archive recovery and retry using the + * offline archive. We have now replayed all the valid WAL in + * pg_xlog, so we are presumably now consistent. + * + * We require that there's at least some valid WAL present in + * pg_xlog, however (!fetch_ckpt). We could recover using the WAL + * from the archive, even if pg_xlog is completely empty, but we'd + * have no idea how far we'd have to replay to reach consistency. + * So err on the safe side and give up. + */ + if (!t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.xlog_cxt.ArchiveRecoveryRequested) { + t_thrd.xlog_cxt.InArchiveRecovery = true; + if (t_thrd.xlog_cxt.StandbyModeRequested) + t_thrd.xlog_cxt.StandbyMode = true; + /* construct a minrecoverypoint, update LSN */ + UpdateMinrecoveryInAchive(); + /* + * Before we retry, reset lastSourceFailed and currentSource + * so that we will check the archive next. + */ + t_thrd.xlog_cxt.failedSources = 0; + continue; + } + + /* In standby mode, loop back to retry. Otherwise, give up. 
*/ + if (t_thrd.xlog_cxt.StandbyMode && !t_thrd.xlog_cxt.recoveryTriggered && !DoEarlyExit()) + continue; + else + return NULL; + } + } +} + +} // namespace extreme_rto \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp b/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp new file mode 100644 index 0000000000000000000000000000000000000000..12deca658db83f38b352b248d74bef1b9dfb784c --- /dev/null +++ b/src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * ------------------------------------------------------------------------- + * + * extreme_rto_redo_api.cpp + * + * IDENTIFICATION + * src/gausskernel/storage/access/transam/extreme_rto_redo_api.cpp + * + * ------------------------------------------------------------------------- + */ + +#include +#include + +#include "postgres.h" +#include "knl/knl_variable.h" +#include "utils/guc.h" +#include "access/multi_redo_api.h" + +#include "access/extreme_rto/dispatcher.h" +#include "access/extreme_rto/page_redo.h" +#include "access/extreme_rto/redo_item.h" +#include "access/extreme_rto/posix_semaphore.h" +#include "access/extreme_rto/spsc_blocking_queue.h" +#include "access/extreme_rto/xlog_read.h" +#include "access/ondemand_extreme_rto/dispatcher.h" +#include "access/ondemand_extreme_rto/page_redo.h" +#include "access/ondemand_extreme_rto/redo_item.h" +#include "access/ondemand_extreme_rto/posix_semaphore.h" +#include "access/ondemand_extreme_rto/spsc_blocking_queue.h" +#include "access/ondemand_extreme_rto/xlog_read.h" + + +ExtremeRtoRedoType g_extreme_rto_type = DEFAULT_EXTREME_RTO; + +typedef struct f_extreme_rto_redo { + void (*wait_all_replay_worker_idle)(void); + void (*dispatch_clean_invalid_page_mark_to_all_redo_worker)(RepairFileKey key); + void (*dispatch_closefd_mark_to_all_redo_worker)(void); + void (*record_bad_block_and_push_to_remote)(XLogBlockDataParse *datadecode, PageErrorType error_type, + XLogRecPtr old_lsn, XLogPhyBlock pblk); + void (*check_committing_csn_list)(void); + XLogRecord *(*read_next_xlog_record)(XLogReaderState **xlogreaderptr, int emode); + void (*extreme_rto_stop_here)(void); + void (*wait_all_redo_worker_queue_empty)(void); + XLogRecPtr (*get_safe_min_check_point)(void); + void (*clear_recovery_thread_hash_tbl)(const RelFileNode &node, ForkNumber forknum, BlockNumber minblkno, + bool segment_shrink); + void (*batch_clear_recovery_thread_hash_tbl)(Oid spcNode, Oid dbNode); + bool (*redo_worker_is_undo_space_worker)(void); + void (*start_recovery_workers)(XLogReaderState *xlogreader, uint32 privateLen); + void (*dispatch_redo_record_to_file)(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); + void (*get_thread_name_if_page_redo_worker)(int argc, char *argv[], char **threadNamePtr); + PGPROC *(*startup_pid_get_proc)(ThreadId pid); + void (*update_standby_state)(HotStandbyState newState); + void (*update_min_recovery_for_trxn_redo_thd)(XLogRecPtr 
newMinRecoveryPoint); + uint32 (*get_my_page_redo_worker_id_with_lock)(void); + void (*parallel_redo_thread_main)(void); + void (*free_allocated_redo_item)(void); + uint32 (*get_all_worker_count)(void); + void **(*get_xlog_invalid_pages_from_workers)(void); + void (*send_recovery_end_mark_to_workers_and_wait_for_finish)(int code); + RedoWaitInfo (*redo_get_io_event)(int32 event_id); + void (*redo_get_worker_statistic)(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen); + void (*redo_get_worker_time_count)(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); +} f_extreme_rto_redo; + +static const f_extreme_rto_redo extreme_rto_redosw[] = { + /* extreme redo */ + { + extreme_rto::WaitAllReplayWorkerIdle, + extreme_rto::DispatchCleanInvalidPageMarkToAllRedoWorker, + extreme_rto::DispatchClosefdMarkToAllRedoWorker, + extreme_rto::RecordBadBlockAndPushToRemote, + extreme_rto::CheckCommittingCsnList, + extreme_rto::ReadNextXLogRecord, + extreme_rto::ExtremeRtoStopHere, + extreme_rto::WaitAllRedoWorkerQueueEmpty, + extreme_rto::GetSafeMinCheckPoint, + extreme_rto::ClearRecoveryThreadHashTbl, + extreme_rto::BatchClearRecoveryThreadHashTbl, + extreme_rto::RedoWorkerIsUndoSpaceWorker, + extreme_rto::StartRecoveryWorkers, + extreme_rto::DispatchRedoRecordToFile, + extreme_rto::GetThreadNameIfPageRedoWorker, + extreme_rto::StartupPidGetProc, + extreme_rto::UpdateStandbyState, + extreme_rto::UpdateMinRecoveryForTrxnRedoThd, + extreme_rto::GetMyPageRedoWorkerIdWithLock, + extreme_rto::ParallelRedoThreadMain, + extreme_rto::FreeAllocatedRedoItem, + extreme_rto::GetAllWorkerCount, + extreme_rto::GetXLogInvalidPagesFromWorkers, + extreme_rto::SendRecoveryEndMarkToWorkersAndWaitForFinish, + extreme_rto::redo_get_io_event, + extreme_rto::redo_get_worker_statistic, + extreme_rto::redo_get_worker_time_count, + }, + + /* ondemand extreme redo */ + { + ondemand_extreme_rto::WaitAllReplayWorkerIdle, + ondemand_extreme_rto::DispatchCleanInvalidPageMarkToAllRedoWorker, + ondemand_extreme_rto::DispatchClosefdMarkToAllRedoWorker, + NULL, + ondemand_extreme_rto::CheckCommittingCsnList, + ondemand_extreme_rto::ReadNextXLogRecord, + ondemand_extreme_rto::ExtremeRtoStopHere, + ondemand_extreme_rto::WaitAllRedoWorkerQueueEmpty, + ondemand_extreme_rto::GetSafeMinCheckPoint, + NULL, + NULL, + ondemand_extreme_rto::RedoWorkerIsUndoSpaceWorker, + ondemand_extreme_rto::StartRecoveryWorkers, + ondemand_extreme_rto::DispatchRedoRecordToFile, + ondemand_extreme_rto::GetThreadNameIfPageRedoWorker, + ondemand_extreme_rto::StartupPidGetProc, + ondemand_extreme_rto::UpdateStandbyState, + ondemand_extreme_rto::UpdateMinRecoveryForTrxnRedoThd, + ondemand_extreme_rto::GetMyPageRedoWorkerIdWithLock, + ondemand_extreme_rto::ParallelRedoThreadMain, + ondemand_extreme_rto::FreeAllocatedRedoItem, + ondemand_extreme_rto::GetAllWorkerCount, + ondemand_extreme_rto::GetXLogInvalidPagesFromWorkers, + ondemand_extreme_rto::SendRecoveryEndMarkToWorkersAndWaitForFinish, + ondemand_extreme_rto::redo_get_io_event, + ondemand_extreme_rto::redo_get_worker_statistic, + ondemand_extreme_rto::redo_get_worker_time_count, + }, +}; + +void ExtremeWaitAllReplayWorkerIdle() +{ + (*(extreme_rto_redosw[g_extreme_rto_type].wait_all_replay_worker_idle))(); +} + +void ExtremeDispatchCleanInvalidPageMarkToAllRedoWorker(RepairFileKey key) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].dispatch_clean_invalid_page_mark_to_all_redo_worker))(key); +} + +void ExtremeDispatchClosefdMarkToAllRedoWorker() +{ + 
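/* each Extreme* wrapper in this file resolves through extreme_rto_redosw, indexed
+     * by g_extreme_rto_type, so one call site serves both the classic extreme_rto and
+     * the ondemand_extreme_rto implementations; slots left NULL in the on-demand row
+     * (for example record_bad_block_and_push_to_remote) must never be reached in that
+     * mode */
+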
(*(extreme_rto_redosw[g_extreme_rto_type].dispatch_closefd_mark_to_all_redo_worker))(); +} + +void ExtremeRecordBadBlockAndPushToRemote(XLogBlockDataParse *datadecode, PageErrorType error_type, + XLogRecPtr old_lsn, XLogPhyBlock pblk) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].record_bad_block_and_push_to_remote))(datadecode, error_type, old_lsn, + pblk); +} + +void ExtremeCheckCommittingCsnList() +{ + (*(extreme_rto_redosw[g_extreme_rto_type].check_committing_csn_list))(); +} + +XLogRecord *ExtremeReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode) +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].read_next_xlog_record))(xlogreaderptr, emode); +} + +void ExtremeExtremeRtoStopHere() +{ + (*(extreme_rto_redosw[g_extreme_rto_type].extreme_rto_stop_here))(); +} + +void ExtremeWaitAllRedoWorkerQueueEmpty() +{ + (*(extreme_rto_redosw[g_extreme_rto_type].wait_all_redo_worker_queue_empty))(); +} + +XLogRecPtr ExtremeGetSafeMinCheckPoint() +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].get_safe_min_check_point))(); +} + +void ExtremeClearRecoveryThreadHashTbl(const RelFileNode &node, ForkNumber forknum, BlockNumber minblkno, + bool segment_shrink) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].clear_recovery_thread_hash_tbl))(node, forknum, minblkno, segment_shrink); +} + +void ExtremeBatchClearRecoveryThreadHashTbl(Oid spcNode, Oid dbNode) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].batch_clear_recovery_thread_hash_tbl))(spcNode, dbNode); +} + +bool ExtremeRedoWorkerIsUndoSpaceWorker() +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].redo_worker_is_undo_space_worker))(); +} + +void ExtremeStartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].start_recovery_workers))(xlogreader, privateLen); +} + +void ExtremeDispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].dispatch_redo_record_to_file))(record, expectedTLIs, recordXTime); +} + +void ExtremeGetThreadNameIfPageRedoWorker(int argc, char *argv[], char **threadNamePtr) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].get_thread_name_if_page_redo_worker))(argc, argv, threadNamePtr); +} + +PGPROC *ExtremeStartupPidGetProc(ThreadId pid) +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].startup_pid_get_proc))(pid); +} + +void ExtremeUpdateStandbyState(HotStandbyState newState) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].update_standby_state))(newState); +} + +void ExtremeUpdateMinRecoveryForTrxnRedoThd(XLogRecPtr newMinRecoveryPoint) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].update_min_recovery_for_trxn_redo_thd))(newMinRecoveryPoint); +} + +uint32 ExtremeGetMyPageRedoWorkerIdWithLock() +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].get_my_page_redo_worker_id_with_lock))(); +} + +void ExtremeParallelRedoThreadMain() +{ + (*(extreme_rto_redosw[g_extreme_rto_type].parallel_redo_thread_main))(); +} + +void ExtremeFreeAllocatedRedoItem() +{ + (*(extreme_rto_redosw[g_extreme_rto_type].free_allocated_redo_item))(); +} + +uint32 ExtremeGetAllWorkerCount() +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].get_all_worker_count))(); +} + +void **ExtremeGetXLogInvalidPagesFromWorkers() +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].get_xlog_invalid_pages_from_workers))(); +} + +void ExtremeSendRecoveryEndMarkToWorkersAndWaitForFinish(int code) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].send_recovery_end_mark_to_workers_and_wait_for_finish))(code); 
+} + +RedoWaitInfo ExtremeRedoGetIoEvent(int32 event_id) +{ + return (*(extreme_rto_redosw[g_extreme_rto_type].redo_get_io_event))(event_id); +} + +void ExtremeRedoGetWorkerStatistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].redo_get_worker_statistic))(realNum, worker, workerLen); +} + +void ExtremeRedoGetWorkerTimeCount(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) +{ + (*(extreme_rto_redosw[g_extreme_rto_type].redo_get_worker_time_count))(workerCountInfoList, realNum); +} + +void ExtremeEndDispatcherContext() +{ + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + (void)MemoryContextSwitchTo(extreme_rto::g_dispatcher->oldCtx); + break; + case ONDEMAND_EXTREME_RTO: + (void)MemoryContextSwitchTo(ondemand_extreme_rto::g_dispatcher->oldCtx); + break; + default: + Assert(true); + } +} + +void ExtremeSetPageRedoWorkerIndex(int index) +{ + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + extreme_rto::g_redoWorker->index = index; + break; + case ONDEMAND_EXTREME_RTO: + ondemand_extreme_rto::g_redoWorker->index = index; + break; + default: + Assert(true); + } +} + +int ExtremeGetPageRedoWorkerIndex() +{ + int result = 0; + + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + result = extreme_rto::g_redoWorker->index; + break; + case ONDEMAND_EXTREME_RTO: + result = ondemand_extreme_rto::g_redoWorker->index; + break; + default: + Assert(true); + } + + return result; +} + +void ExtremeSetMyPageRedoWorker(knl_thread_arg *arg) +{ + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + extreme_rto::g_redoWorker = (extreme_rto::PageRedoWorker *)arg->payload; + break; + case ONDEMAND_EXTREME_RTO: + ondemand_extreme_rto::g_redoWorker = (ondemand_extreme_rto::PageRedoWorker *)arg->payload; + break; + default: + Assert(true); + } +} + +uint32 ExtremeGetMyPageRedoWorkerId() +{ + uint32 result = 0; + + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + result = extreme_rto::g_redoWorker->id; + break; + case ONDEMAND_EXTREME_RTO: + result = ondemand_extreme_rto::g_redoWorker->id; + break; + default: + Assert(true); + } + + return result; +} + +bool IsExtremeMultiThreadRedoRunning() +{ + return (get_real_recovery_parallelism() > 1 && + (extreme_rto::g_dispatcher != 0 || ondemand_extreme_rto::g_dispatcher != 0 )); +} + +bool IsExtremeRtoRunning() +{ + bool result = false; + + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + result = (get_real_recovery_parallelism() > 1 && extreme_rto::g_dispatcher != 0 && + extreme_rto::g_dispatcher->pageLineNum > 0); + break; + case ONDEMAND_EXTREME_RTO: + result = (get_real_recovery_parallelism() > 1 && ondemand_extreme_rto::g_dispatcher != 0 && + ondemand_extreme_rto::g_dispatcher->pageLineNum > 0); + break; + default: + Assert(true); + } + + return result; +} + +bool IsExtremeRtoSmartShutdown() +{ + if (!IsExtremeRtoRunning()) { + return false; + } + + bool result = false; + + switch (g_extreme_rto_type) { + case DEFAULT_EXTREME_RTO: + if (extreme_rto::g_dispatcher->smartShutdown) { + extreme_rto::g_dispatcher->smartShutdown = false; + result = true; + } + break; + case ONDEMAND_EXTREME_RTO: + if (ondemand_extreme_rto::g_dispatcher->smartShutdown) { + ondemand_extreme_rto::g_dispatcher->smartShutdown = false; + result = true; + } + break; + default: + Assert(true); + } + + return result; +} + +void ExtremeRtoRedoManagerSendEndToStartup() +{ + if (!IsExtremeRtoRunning()) { + return; + } + + switch (g_extreme_rto_type) { + case 
DEFAULT_EXTREME_RTO: + extreme_rto::g_redoEndMark.record.isDecode = true; + extreme_rto::PutRecordToReadQueue((XLogReaderState *)&extreme_rto::g_redoEndMark.record); + break; + case ONDEMAND_EXTREME_RTO: + ondemand_extreme_rto::g_redoEndMark.record.isDecode = true; + ondemand_extreme_rto::PutRecordToReadQueue((XLogReaderState *)&ondemand_extreme_rto::g_redoEndMark.record); + break; + default: + Assert(true); + } +} \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/multi_redo_api.cpp b/src/gausskernel/storage/access/transam/multi_redo_api.cpp index 6ac49cb1e248331f381095377a6e9acb795d4bd5..2d70a75a657454e7f3d5c183581f889438bdeace 100644 --- a/src/gausskernel/storage/access/transam/multi_redo_api.cpp +++ b/src/gausskernel/storage/access/transam/multi_redo_api.cpp @@ -31,19 +31,17 @@ #include "access/multi_redo_settings.h" #include "access/multi_redo_api.h" -#include "access/extreme_rto/dispatcher.h" #include "access/parallel_recovery/dispatcher.h" -#include "access/extreme_rto/page_redo.h" #include "access/parallel_recovery/page_redo.h" #include "access/xlog_internal.h" bool g_supportHotStandby = true; /* don't support consistency view */ - +uint32 g_startupTriggerState = TRIGGER_NORMAL; void StartUpMultiRedo(XLogReaderState *xlogreader, uint32 privateLen) { if (IsExtremeRedo()) { - extreme_rto::StartRecoveryWorkers(xlogreader, privateLen); + ExtremeStartRecoveryWorkers(xlogreader, privateLen); } else if (IsParallelRedo()) { parallel_recovery::StartRecoveryWorkers(xlogreader->ReadRecPtr); } @@ -51,44 +49,14 @@ void StartUpMultiRedo(XLogReaderState *xlogreader, uint32 privateLen) bool IsMultiThreadRedoRunning() { - return (get_real_recovery_parallelism() > 1 && - (extreme_rto::g_dispatcher != 0 || parallel_recovery::g_dispatcher != 0)); -} - -bool IsExtremeRtoRunning() -{ - return (get_real_recovery_parallelism() > 1 && extreme_rto::g_dispatcher != 0 && - extreme_rto::g_dispatcher->pageLineNum > 0); -} - - -bool IsExtremeRtoSmartShutdown() -{ - if (!IsExtremeRtoRunning()) { - return false; - } - - if (extreme_rto::g_dispatcher->smartShutdown) { - extreme_rto::g_dispatcher->smartShutdown =false; - return true; - } - return false; -} - -void ExtremeRtoRedoManagerSendEndToStartup() -{ - if (!IsExtremeRtoRunning()) { - return; - } - - extreme_rto::g_redoEndMark.record.isDecode = true; - extreme_rto::PutRecordToReadQueue((XLogReaderState *)&extreme_rto::g_redoEndMark.record); + return ((get_real_recovery_parallelism() > 1 && parallel_recovery::g_dispatcher != 0) || + IsExtremeMultiThreadRedoRunning()); } void DispatchRedoRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) { if (IsExtremeRedo()) { - extreme_rto::DispatchRedoRecordToFile(record, expectedTLIs, recordXTime); + ExtremeDispatchRedoRecordToFile(record, expectedTLIs, recordXTime); } else if (IsParallelRedo()) { parallel_recovery::DispatchRedoRecordToFile(record, expectedTLIs, recordXTime); } else { @@ -112,7 +80,7 @@ void DispatchRedoRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz void GetThreadNameIfMultiRedo(int argc, char *argv[], char **threadNamePtr) { if (IsExtremeRedo()) { - extreme_rto::GetThreadNameIfPageRedoWorker(argc, argv, threadNamePtr); + ExtremeGetThreadNameIfPageRedoWorker(argc, argv, threadNamePtr); } else if (IsParallelRedo()) { parallel_recovery::GetThreadNameIfPageRedoWorker(argc, argv, threadNamePtr); } @@ -121,7 +89,7 @@ void GetThreadNameIfMultiRedo(int argc, char *argv[], char **threadNamePtr) PGPROC *MultiRedoThreadPidGetProc(ThreadId pid) { 
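+    /* Both redo modes keep their own worker registry, so the PID lookup is
+     * routed to whichever implementation is currently active. */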
if (IsExtremeRedo()) { - return extreme_rto::StartupPidGetProc(pid); + return ExtremeStartupPidGetProc(pid); } else { return parallel_recovery::StartupPidGetProc(pid); } @@ -130,7 +98,7 @@ PGPROC *MultiRedoThreadPidGetProc(ThreadId pid) void MultiRedoUpdateStandbyState(HotStandbyState newState) { if (IsExtremeRedo()) { - extreme_rto::UpdateStandbyState(newState); + ExtremeUpdateStandbyState(newState); } else if (IsParallelRedo()) { parallel_recovery::UpdateStandbyState(newState); } @@ -139,14 +107,14 @@ void MultiRedoUpdateStandbyState(HotStandbyState newState) void MultiRedoUpdateMinRecovery(XLogRecPtr newMinRecoveryPoint) { if (IsExtremeRedo()) { - extreme_rto::UpdateMinRecoveryForTrxnRedoThd(newMinRecoveryPoint); + ExtremeUpdateMinRecoveryForTrxnRedoThd(newMinRecoveryPoint); } } uint32 MultiRedoGetWorkerId() { if (IsExtremeRedo()) { - return extreme_rto::GetMyPageRedoWorkerIdWithLock(); + return ExtremeGetMyPageRedoWorkerIdWithLock(); } else if (IsParallelRedo()) { return parallel_recovery::GetMyPageRedoWorkerOrignId(); } else { @@ -175,7 +143,7 @@ bool IsAllPageWorkerExit() void SetPageRedoWorkerIndex(int index) { if (IsExtremeRedo()) { - extreme_rto::g_redoWorker->index = index; + ExtremeSetPageRedoWorkerIndex(index); } else if (IsParallelRedo()) { parallel_recovery::g_redoWorker->index = index; } @@ -184,7 +152,7 @@ void SetPageRedoWorkerIndex(int index) int GetPageRedoWorkerIndex(int index) { if (IsExtremeRedo()) { - return extreme_rto::g_redoWorker->index; + return ExtremeGetPageRedoWorkerIndex(); } else if (IsParallelRedo()) { return parallel_recovery::g_redoWorker->index; } else { @@ -226,7 +194,7 @@ void ProcTxnWorkLoad(bool force) void SetMyPageRedoWorker(knl_thread_arg *arg) { if (IsExtremeRedo()) { - extreme_rto::g_redoWorker = (extreme_rto::PageRedoWorker *)arg->payload; + ExtremeSetMyPageRedoWorker(arg); } else if (IsParallelRedo()) { parallel_recovery::g_redoWorker = (parallel_recovery::PageRedoWorker *)arg->payload; } @@ -236,7 +204,7 @@ void SetMyPageRedoWorker(knl_thread_arg *arg) uint32 GetMyPageRedoWorkerId() { if (IsExtremeRedo()) { - return extreme_rto::g_redoWorker->id; + return ExtremeGetMyPageRedoWorkerId(); } else if (IsParallelRedo()) { return parallel_recovery::g_redoWorker->id; } else { @@ -249,7 +217,7 @@ void MultiRedoMain() pgstat_report_appname("PageRedo"); pgstat_report_activity(STATE_IDLE, NULL); if (IsExtremeRedo()) { - extreme_rto::ParallelRedoThreadMain(); + ExtremeParallelRedoThreadMain(); } else if (IsParallelRedo()) { parallel_recovery::PageRedoWorkerMain(); } else { @@ -260,7 +228,7 @@ void MultiRedoMain() void EndDispatcherContext() { if (IsExtremeRedo()) { - (void)MemoryContextSwitchTo(extreme_rto::g_dispatcher->oldCtx); + ExtremeEndDispatcherContext(); } else if (IsParallelRedo()) { (void)MemoryContextSwitchTo(parallel_recovery::g_dispatcher->oldCtx); @@ -275,7 +243,7 @@ void SwitchToDispatcherContext() void FreeAllocatedRedoItem() { if (IsExtremeRedo()) { - extreme_rto::FreeAllocatedRedoItem(); + ExtremeFreeAllocatedRedoItem(); } else if (IsParallelRedo()) { parallel_recovery::FreeAllocatedRedoItem(); @@ -285,7 +253,7 @@ void FreeAllocatedRedoItem() uint32 GetRedoWorkerCount() { if (IsExtremeRedo()) { - return extreme_rto::GetAllWorkerCount(); + return ExtremeGetAllWorkerCount(); } else if (IsParallelRedo()) { return parallel_recovery::GetPageWorkerCount(); @@ -297,7 +265,7 @@ uint32 GetRedoWorkerCount() void **GetXLogInvalidPagesFromWorkers() { if (IsExtremeRedo()) { - return extreme_rto::GetXLogInvalidPagesFromWorkers(); + return 
ExtremeGetXLogInvalidPagesFromWorkers(); } else if (IsParallelRedo()) { return parallel_recovery::GetXLogInvalidPagesFromWorkers(); @@ -309,7 +277,7 @@ void **GetXLogInvalidPagesFromWorkers() void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code) { if (IsExtremeRedo()) { - return extreme_rto::SendRecoveryEndMarkToWorkersAndWaitForFinish(code); + return ExtremeSendRecoveryEndMarkToWorkersAndWaitForFinish(code); } else if (IsParallelRedo()) { return parallel_recovery::SendRecoveryEndMarkToWorkersAndWaitForFinish(code); @@ -319,27 +287,27 @@ void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code) RedoWaitInfo GetRedoIoEvent(int32 event_id) { if (IsExtremeRedo()) { - return extreme_rto::redo_get_io_event(event_id); + return ExtremeRedoGetIoEvent(event_id); } else { return parallel_recovery::redo_get_io_event(event_id); } } -void GetRedoWrokerStatistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) +void GetRedoWorkerStatistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) { if (IsExtremeRedo()) { - return extreme_rto::redo_get_wroker_statistic(realNum, worker, workerLen); + return ExtremeRedoGetWorkerStatistic(realNum, worker, workerLen); } else { - return parallel_recovery::redo_get_wroker_statistic(realNum, worker, workerLen); + return parallel_recovery::redo_get_worker_statistic(realNum, worker, workerLen); } } void GetRedoWorkerTimeCount(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) { if (IsExtremeRedo()) { - extreme_rto::redo_get_wroker_time_count(workerCountInfoList, realNum); + ExtremeRedoGetWorkerTimeCount(workerCountInfoList, realNum); } else if (IsParallelRedo()) { - parallel_recovery::redo_get_wroker_time_count(workerCountInfoList, realNum); + parallel_recovery::redo_get_worker_time_count(workerCountInfoList, realNum); } else { *realNum = 0; } diff --git a/src/gausskernel/storage/access/transam/multi_redo_settings.cpp b/src/gausskernel/storage/access/transam/multi_redo_settings.cpp index 6ed92595e3612f26cc6d7f48bcbe33fa345dbe2d..8cb19710548181ac0b50e9b3e7bdf3f1c59dd72d 100644 --- a/src/gausskernel/storage/access/transam/multi_redo_settings.cpp +++ b/src/gausskernel/storage/access/transam/multi_redo_settings.cpp @@ -44,6 +44,8 @@ void ConfigRecoveryParallelism() if (g_instance.attr.attr_storage.recovery_parse_workers > 1) { g_instance.comm_cxt.predo_cxt.redoType = EXTREME_REDO; + g_extreme_rto_type = g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery ? + ONDEMAND_EXTREME_RTO : DEFAULT_EXTREME_RTO; g_instance.attr.attr_storage.batch_redo_num = g_instance.attr.attr_storage.recovery_parse_workers; uint32 total_recovery_parallelism = g_instance.attr.attr_storage.batch_redo_num * 2 + g_instance.attr.attr_storage.recovery_redo_workers_per_paser_worker * diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e2fea1445e2f7d46bf21d6875a524a5843d14e --- /dev/null +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/CMakeLists.txt @@ -0,0 +1,22 @@ +#This is the main CMAKE for build bin. 
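+# AUX_SOURCE_DIRECTORY gathers every source file in this directory into
+# TGT_ondemand_extreme_rto_SRC, so source files added here are compiled into
+# the object target defined below without being listed explicitly.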
+AUX_SOURCE_DIRECTORY(${CMAKE_CURRENT_SOURCE_DIR} TGT_ondemand_extreme_rto_SRC) + +set(TGT_ondemand_extreme_rto_INC + ${PROJECT_SRC_DIR}/gausskernel/cbb/communication + ${PROJECT_SRC_DIR}/include/iprange + ${PROJECT_SRC_DIR}/include/libcomm + ${PROJECT_SRC_DIR}/include + ${PROJECT_SRC_DIR}/lib/gstrace + ${LIBCGROUP_INCLUDE_PATH} + ${PROJECT_SRC_DIR}/include/libcomm + ${ZLIB_INCLUDE_PATH} + ${LIBCURL_INCLUDE_PATH} + ${DCF_INCLUDE_PATH} +) + +set(ondemand_extreme_rto_DEF_OPTIONS ${MACRO_OPTIONS}) +set(ondemand_extreme_rto_COMPILE_OPTIONS ${OPTIMIZE_OPTIONS} ${OS_OPTIONS} ${PROTECT_OPTIONS} ${WARNING_OPTIONS} ${BIN_SECURE_OPTIONS} ${CHECK_OPTIONS}) +set(ondemand_extreme_rto_LINK_OPTIONS ${BIN_LINK_OPTIONS}) +add_static_objtarget(gausskernel_storage_access_transam_ondemand_extreme_rto TGT_ondemand_extreme_rto_SRC TGT_ondemand_extreme_rto_INC "${ondemand_extreme_rto_DEF_OPTIONS}" + "${ondemand_extreme_rto_COMPILE_OPTIONS}" "${ondemand_extreme_rto_LINK_OPTIONS}") + diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0d3a32716088cb18a5c09c6a8c45f94abaeee5f7 --- /dev/null +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile @@ -0,0 +1,31 @@ +# +# Copyright (c) 2020 Huawei Technologies Co.,Ltd. +# +# openGauss is licensed under Mulan PSL v2. +# You can use this software according to the terms and conditions of the Mulan PSL v2. +# You may obtain a copy of Mulan PSL v2 at: +# +# http://license.coscl.org.cn/MulanPSL2 +# +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, +# EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, +# MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. +# See the Mulan PSL v2 for more details. +# --------------------------------------------------------------------------------------- +# +# Makefile +# Makefile for access/transam/ondemand_extreme_rto +# +# IDENTIFICATION +# src/gausskernel/storage/access/transam/ondemand_extreme_rto/Makefile +# +# --------------------------------------------------------------------------------------- + +subdir = src/gausskernel/storage/access/transam/ondemand_extreme_rto +top_builddir = ../../../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = dispatcher.o page_redo.o posix_semaphore.o redo_item.o \ + spsc_blocking_queue.o txn_redo.o batch_redo.o xlog_read.o redo_utils.o + +include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d5856c80ef09af02f1df08cc13efc8be122b8d6 --- /dev/null +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * ------------------------------------------------------------------------- + * + * batch_redo.cpp + * + * IDENTIFICATION + * src/gausskernel/storage/access/transam/ondemand_extreme_rto/batch_redo.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlog_internal.h" +#include "access/xlogreader.h" +#include "access/xlogproc.h" +#include "access/visibilitymap.h" +#include "catalog/storage_xlog.h" +#include "commands/dbcommands.h" +#include "commands/tablespace.h" +#include "storage/freespace.h" +#include "storage/smgr/relfilenode_hash.h" +#include "utils/relmapper.h" + +#include "access/ondemand_extreme_rto/batch_redo.h" +#include "access/ondemand_extreme_rto/redo_item.h" +#include "access/ondemand_extreme_rto/dispatcher.h" +#include "access/ondemand_extreme_rto/page_redo.h" + +#include "access/xlogproc.h" + +extern uint32 hashquickany(uint32 seed, register const unsigned char *data, register int len); + +namespace ondemand_extreme_rto { +static inline void PRXLogRecGetBlockTag(XLogRecParseState *recordBlockState, RelFileNode *rnode, BlockNumber *blknum, + ForkNumber *forknum) +{ + XLogBlockParse *blockparse = &(recordBlockState->blockparse); + + if (rnode != NULL) { + rnode->dbNode = blockparse->blockhead.dbNode; + rnode->relNode = blockparse->blockhead.relNode; + rnode->spcNode = blockparse->blockhead.spcNode; + rnode->bucketNode = blockparse->blockhead.bucketNode; + rnode->opt = blockparse->blockhead.opt; + } + if (blknum != NULL) { + *blknum = blockparse->blockhead.blkno; + } + if (forknum != NULL) { + *forknum = blockparse->blockhead.forknum; + } +} + +uint32 XlogTrackTableHashCode(RedoItemTag *tagPtr) +{ + return hashquickany(0xFFFFFFFF, (unsigned char *)tagPtr, sizeof(RedoItemTag)); +} + +void PRInitRedoItemEntry(RedoItemHashEntry *redoItemHashEntry) +{ + redoItemHashEntry->redoItemNum = 0; + redoItemHashEntry->head = NULL; + redoItemHashEntry->tail = NULL; + redoItemHashEntry->redoDone = false; +} + +uint32 RedoItemTagHash(const void *key, Size keysize) +{ + RedoItemTag redoItemTag = *(const RedoItemTag *)key; + redoItemTag.rNode.opt = DefaultFileNodeOpt; + return DatumGetUInt32(hash_any((const unsigned char *)&redoItemTag, (int)keysize)); +} + +int RedoItemTagMatch(const void *left, const void *right, Size keysize) +{ + const RedoItemTag *leftKey = (const RedoItemTag *)left; + const RedoItemTag *rightKey = (const RedoItemTag *)right; + Assert(keysize == sizeof(RedoItemTag)); + + /* we just care whether the result is 0 or not */ + if (RelFileNodeEquals(leftKey->rNode, rightKey->rNode) && leftKey->forkNum == rightKey->forkNum && + leftKey->blockNum == rightKey->blockNum) { + return 0; + } + + return 1; +} + +HTAB **PRRedoItemHashInitialize(MemoryContext context) +{ + HASHCTL ctl; + int batchNum = get_batch_redo_num(); + HTAB **hTab = (HTAB **)MemoryContextAllocZero(context, batchNum * sizeof(HTAB *)); + + /* + * create hashtable that indexes the redo items + */ + errno_t rc = memset_s(&ctl, sizeof(ctl), 0, sizeof(ctl)); + securec_check(rc, "\0", "\0"); + ctl.hcxt = context; + ctl.keysize = sizeof(RedoItemTag); + ctl.entrysize = sizeof(RedoItemHashEntry); + ctl.hash = RedoItemTagHash; + ctl.match = RedoItemTagMatch; + for (int i = 0; i < batchNum; i++) { + hTab[i] = hash_create("Redo item hash by relfilenode and blocknum", INITredoItemHashSIZE, &ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT | HASH_SHRCTX | HASH_COMPARE); + } + + return hTab; +} + 
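+/*
+ * Each batch redo worker owns one of these hash tables, keyed by RedoItemTag
+ * (RelFileNode, forkNum, blockNum), so every parse state touching the same
+ * page is chained onto a single entry and can be replayed on demand.  A
+ * typical probe looks like the following sketch (INIT_REDO_ITEM_TAG and
+ * RedoItemHashEntry are the ones used below; batchId selects one of the
+ * per-batch tables returned by PRRedoItemHashInitialize):
+ *
+ *     RedoItemTag tag;
+ *     INIT_REDO_ITEM_TAG(tag, rNode, forkNum, blkNo);
+ *     bool found = false;
+ *     RedoItemHashEntry *entry =
+ *         (RedoItemHashEntry *)hash_search(hTab[batchId], (void *)&tag, HASH_FIND, &found);
+ *
+ * Note that RedoItemTagHash normalizes rNode.opt to DefaultFileNodeOpt before
+ * hashing, so the compression option never influences the bucket choice.
+ */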
+void PRRegisterBlockInsertToList(RedoItemHashEntry *redoItemHashEntry, XLogRecParseState *record) +{ + ReferenceRecParseState(record); + if (redoItemHashEntry->tail != NULL) { + redoItemHashEntry->tail->nextrecord = record; + redoItemHashEntry->tail = record; + } else { + redoItemHashEntry->tail = record; + redoItemHashEntry->head = record; + } + record->nextrecord = NULL; + redoItemHashEntry->redoItemNum++; +} + +void PRRegisterBlockChangeExtended(XLogRecParseState *recordBlockState, const RelFileNode rNode, ForkNumber forkNum, + BlockNumber blkNo, HTAB *redoItemHash) +{ + RedoItemTag redoItemTag; + RedoItemHashEntry *redoItemHashEntry = NULL; + bool found = true; + + INIT_REDO_ITEM_TAG(redoItemTag, rNode, forkNum, blkNo); + + redoItemHashEntry = (RedoItemHashEntry *)hash_search(redoItemHash, (void *)&redoItemTag, HASH_ENTER, &found); + if (redoItemHashEntry == NULL) { + ereport(ERROR, (errcode(ERRCODE_FETCH_DATA_FAILED), + errmsg("could not find or create redo item entry: rel %u/%u/%u " + "forknum %d blkno %u", + rNode.spcNode, rNode.dbNode, rNode.relNode, forkNum, blkNo))); + } + + if (!found) { + PRInitRedoItemEntry(redoItemHashEntry); + } + PRRegisterBlockInsertToList(redoItemHashEntry, recordBlockState); +} + +void PRTrackRemoveEntry(HTAB *hashMap, RedoItemHashEntry *entry) +{ + XLogRecParseState *recordBlockState = entry->head; +#ifdef USE_ASSERT_CHECKING + XLogRecParseState *nextBlockState = entry->head; + while (nextBlockState != NULL) { + XLogRecParseState *prev = nextBlockState; + nextBlockState = (XLogRecParseState *)(nextBlockState->nextrecord); + + if (prev->refrecord != NULL) { + DoRecordCheck(prev, InvalidXLogRecPtr, false); + } + + ereport(LOG, (errmsg("PRTrackRemoveEntry:record(%X/%X) relation %u/%u/%u forknum %u blocknum %u dropped(%p)", + (uint32)(prev->blockparse.blockhead.end_ptr >> 32), + (uint32)(prev->blockparse.blockhead.end_ptr), prev->blockparse.blockhead.spcNode, + prev->blockparse.blockhead.dbNode, prev->blockparse.blockhead.relNode, + prev->blockparse.blockhead.forknum, prev->blockparse.blockhead.blkno, prev->refrecord))); + } + +#endif + XLogBlockParseStateRelease(recordBlockState); + + if (hash_search(hashMap, entry, HASH_REMOVE, NULL) == NULL) { + ereport(ERROR, (errmsg("PRTrackRemoveEntry:Redo item hash table corrupted"))); + } +} + +void PRTrackRelTruncate(HTAB *hashMap, const RelFileNode rNode, ForkNumber forkNum, BlockNumber blkNo) +{ + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + hash_seq_init(&status, hashMap); + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + if (RelFileNodeEquals(redoItemEntry->redoItemTag.rNode, rNode) && + redoItemEntry->redoItemTag.forkNum == forkNum && (redoItemEntry->redoItemTag.blockNum >= blkNo)) { + PRTrackRemoveEntry(hashMap, redoItemEntry); + } + } +} + +void PRTrackTableSpaceDrop(XLogRecParseState *recordBlockState, HTAB *hashMap) +{ + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + hash_seq_init(&status, hashMap); + + RelFileNode rNode; + PRXLogRecGetBlockTag(recordBlockState, &rNode, NULL, NULL); +#ifdef USE_ASSERT_CHECKING + ereport(LOG, (errmsg("PRTrackRelTruncate:(%X/%X)clear table space %u record", + (uint32)(recordBlockState->blockparse.blockhead.end_ptr >> 32), + (uint32)(recordBlockState->blockparse.blockhead.end_ptr), rNode.spcNode))); +#endif + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + if (redoItemEntry->redoItemTag.rNode.spcNode == rNode.spcNode) { + PRTrackRemoveEntry(hashMap, 
redoItemEntry);
+        }
+    }
+    XLogBlockParseStateRelease(recordBlockState);
+}
+
+void PRTrackDatabaseDrop(XLogRecParseState *recordBlockState, HTAB *hashMap)
+{
+    HASH_SEQ_STATUS status;
+    RedoItemHashEntry *redoItemEntry = NULL;
+    hash_seq_init(&status, hashMap);
+
+    RelFileNode rNode;
+    PRXLogRecGetBlockTag(recordBlockState, &rNode, NULL, NULL);
+#ifdef USE_ASSERT_CHECKING
+    ereport(LOG, (errmsg("PRTrackRelTruncate:(%X/%X)clear db %u/%u record",
+        (uint32)(recordBlockState->blockparse.blockhead.end_ptr >> 32),
+        (uint32)(recordBlockState->blockparse.blockhead.end_ptr), rNode.spcNode, rNode.dbNode)));
+#endif
+
+    while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) {
+        if (redoItemEntry->redoItemTag.rNode.spcNode == rNode.spcNode &&
+            redoItemEntry->redoItemTag.rNode.dbNode == rNode.dbNode) {
+            PRTrackRemoveEntry(hashMap, redoItemEntry);
+        }
+    }
+    XLogBlockParseStateRelease(recordBlockState);
+}
+
+void PRTrackDropFiles(HTAB *redoItemHash, XLogBlockDdlParse *ddlParse, XLogRecPtr lsn)
+{
+    ColFileNodeRel *xnodes = (ColFileNodeRel *)ddlParse->mainData;
+    bool compress = ddlParse->compress;
+    for (int i = 0; i < ddlParse->rels; ++i) {
+        ColFileNode colFileNode;
+        if (compress) {
+            ColFileNode *colFileNodeRel = ((ColFileNode *)(void *)xnodes) + i;
+            ColFileNodeFullCopy(&colFileNode, colFileNodeRel);
+        } else {
+            ColFileNodeRel *colFileNodeRel = xnodes + i;
+            ColFileNodeCopy(&colFileNode, colFileNodeRel);
+        }
+
+        if (!IsValidColForkNum(colFileNode.forknum)) {
+            for (int fork = 0; fork < MAX_FORKNUM; ++fork)
+                PRTrackRelTruncate(redoItemHash, colFileNode.filenode, fork, 0);
+        } else {
+            PRTrackRelTruncate(redoItemHash, colFileNode.filenode, colFileNode.forknum, 0);
+        }
+#ifdef USE_ASSERT_CHECKING
+        ereport(LOG, (errmsg("PRTrackRelTruncate(drop):(%X/%X)clear relation %u/%u/%u forknum %d record",
+            (uint32)(lsn >> 32), (uint32)(lsn), colFileNode.filenode.spcNode, colFileNode.filenode.dbNode,
+            colFileNode.filenode.relNode, colFileNode.forknum)));
+#endif
+    }
+}
+
+void PRTrackRelStorageDrop(XLogRecParseState *recordBlockState, HTAB *redoItemHash)
+{
+    XLogBlockParse *blockparse = &(recordBlockState->blockparse);
+    XLogBlockDdlParse *ddlParse = NULL;
+    XLogBlockParseGetDdlParse(recordBlockState, ddlParse);
+
+    if (ddlParse->blockddltype == BLOCK_DDL_TRUNCATE_RELNODE) {
+        RelFileNode rNode;
+        rNode.spcNode = blockparse->blockhead.spcNode;
+        rNode.dbNode = blockparse->blockhead.dbNode;
+        rNode.relNode = blockparse->blockhead.relNode;
+        rNode.bucketNode = blockparse->blockhead.bucketNode;
+        rNode.opt = blockparse->blockhead.opt;
+#ifdef USE_ASSERT_CHECKING
+        ereport(LOG, (errmsg("PRTrackRelTruncate:(%X/%X)clear relation %u/%u/%u forknum %u record",
+            (uint32)(blockparse->blockhead.end_ptr >> 32), (uint32)(blockparse->blockhead.end_ptr), rNode.spcNode,
+            rNode.dbNode, rNode.relNode, blockparse->blockhead.forknum)));
+#endif
+        PRTrackRelTruncate(redoItemHash, rNode, blockparse->blockhead.forknum, blockparse->blockhead.blkno);
+    } else {
+        PRTrackDropFiles(redoItemHash, ddlParse, blockparse->blockhead.end_ptr);
+    }
+
+    XLogBlockParseStateRelease(recordBlockState);
+}
+
+/* Get the relfilenode, fork number and block number of the modified page, then register the change. */
+void PRTrackRelPageModification(XLogRecParseState *recordBlockState, HTAB *redoItemHash)
+{
+    RelFileNode relnode;
+    ForkNumber forkNum;
+    BlockNumber blkNo;
+
+    PRXLogRecGetBlockTag(recordBlockState, &relnode, &blkNo, &forkNum);
+
+    PRRegisterBlockChangeExtended(recordBlockState, relnode, forkNum, blkNo, redoItemHash);
+}
+
+/**
+ * For a page-modification block state, put it into the redo item hash.
+ */
+void
PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash)
+{
+    Assert(recordBlockState->blockparse.blockhead.block_valid < BLOCK_DATA_DDL_TYPE);
+    PRTrackRelPageModification(recordBlockState, redoItemHash);
+}
+
+/**
+ * For other (non-page) block states, clear the related block states tracked in
+ * the hash table (releasing them), then release this record's state as well.
+ */
+void PRTrackClearBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash)
+{
+    Assert(recordBlockState != NULL);
+    Assert(redoItemHash != NULL);
+    XLogBlockParse *blockparse = &(recordBlockState->blockparse);
+    if (blockparse->blockhead.block_valid == BLOCK_DATA_DDL_TYPE) {
+        PRTrackRelStorageDrop(recordBlockState, redoItemHash);
+    } else if (blockparse->blockhead.block_valid == BLOCK_DATA_DROP_DATABASE_TYPE) {
+        PRTrackDatabaseDrop(recordBlockState, redoItemHash);
+    } else if (blockparse->blockhead.block_valid == BLOCK_DATA_DROP_TBLSPC_TYPE) {
+        PRTrackTableSpaceDrop(recordBlockState, redoItemHash);
+    } else {
+        const uint32 rightShiftSize = 32;
+        ereport(WARNING,
+            (errmsg("PRTrackClearBlock:(%X/%X) not identified %u/%u/%u forknum %d record",
+                (uint32)(blockparse->blockhead.end_ptr >> rightShiftSize),
+                (uint32)(blockparse->blockhead.end_ptr), blockparse->blockhead.spcNode,
+                blockparse->blockhead.dbNode, blockparse->blockhead.relNode, blockparse->blockhead.forknum)));
+        XLogBlockParseStateRelease(recordBlockState);
+    }
+}
+
+} // namespace ondemand_extreme_rto
diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e50af152afd4a04a0f9118dccf6c329547fe3c9b
--- /dev/null
+++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp
@@ -0,0 +1,2392 @@
+/*
+ * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ * http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * dispatcher.cpp
+ *      Parallel recovery has a centralized log dispatcher which runs inside
+ *      the StartupProcess.  The dispatcher is responsible for managing the
+ *      life cycle of PageRedoWorkers and the TxnRedoWorker, analyzing log
+ *      records and dispatching them to workers for processing.
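+ *      In the on-demand variant, the dispatched block states are additionally
+ *      tracked in per-batch redo item hash tables (see batch_redo.cpp), so
+ *      that individual pages can be redone on first access instead of
+ *      strictly in log order.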
+ * + * IDENTIFICATION + * src/gausskernel/storage/access/transam/ondemand_extreme_rto/dispatcher.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "knl/knl_variable.h" +#include "postmaster/startup.h" +#include "access/clog.h" +#include "access/xact.h" +#include "access/xlog_internal.h" +#include "access/nbtree.h" +#include "access/ubtree.h" +#include "access/hash_xlog.h" +#include "access/xlogreader.h" +#include "access/gist_private.h" +#include "access/multixact.h" +#include "access/spgist_private.h" +#include "access/gin_private.h" +#include "access/xlogutils.h" +#include "access/gin.h" + +#include "catalog/storage_xlog.h" +#include "storage/buf/buf_internals.h" +#include "storage/ipc.h" +#include "storage/standby.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/palloc.h" +#include "utils/guc.h" +#include "utils/relmapper.h" + +#include "portability/instr_time.h" + +#include "access/ondemand_extreme_rto/dispatcher.h" +#include "access/ondemand_extreme_rto/page_redo.h" +#include "access/multi_redo_api.h" + +#include "access/ondemand_extreme_rto/txn_redo.h" +#include "access/ondemand_extreme_rto/spsc_blocking_queue.h" +#include "access/ondemand_extreme_rto/redo_item.h" +#include "access/ondemand_extreme_rto/batch_redo.h" + +#include "catalog/storage.h" +#include +#include "utils/memutils.h" + +#include "commands/dbcommands.h" +#include "commands/tablespace.h" +#include "commands/sequence.h" + +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "gssignal/gs_signal.h" +#include "utils/atomic.h" +#include "pgstat.h" +#include "ddes/dms/ss_reform_common.h" + +#ifdef PGXC +#include "pgxc/pgxc.h" +#endif + +#ifdef ENABLE_UT +#include "utils/utesteventutil.h" +#define STATIC +#else +#define STATIC static +#endif + +extern THR_LOCAL bool redo_oldversion_xlog; + +namespace ondemand_extreme_rto { +LogDispatcher *g_dispatcher = NULL; + +static const int XLOG_INFO_SHIFT_SIZE = 4; /* xlog info flag shift size */ + +static const int32 MAX_PENDING = 1; +static const int32 MAX_PENDING_STANDBY = 1; +static const int32 ITEM_QUQUE_SIZE_RATIO = 5; + +static const uint32 EXIT_WAIT_DELAY = 100; /* 100 us */ +uint32 g_readManagerTriggerFlag = TRIGGER_NORMAL; +static const int invalid_worker_id = -1; + +static const int UNDO_START_BLK = 1; +static const int UHEAP_UPDATE_UNDO_START_BLK = 2; + +typedef void *(*GetStateFunc)(PageRedoWorker *worker); + +static void AddSlotToPLSet(uint32); +static void **CollectStatesFromWorkers(GetStateFunc); +static void GetSlotIds(XLogReaderState *record); +static void GetUndoSlotIds(XLogReaderState *record); +STATIC LogDispatcher *CreateDispatcher(); +static void SSDestroyRecoveryWorkers(); + +static void DispatchRecordWithPages(XLogReaderState *, List *); +static void DispatchRecordWithoutPage(XLogReaderState *, List *); +static void DispatchTxnRecord(XLogReaderState *, List *); +static void StartPageRedoWorkers(uint32); +static void StopRecoveryWorkers(int, Datum); +static bool StandbyWillChangeStandbyState(const XLogReaderState *); +static void DispatchToSpecPageWorker(XLogReaderState *record, List *expectedTLIs); + +static bool DispatchXLogRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchXactRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchSmgrRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool 
DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchCompresseShrinkRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchDataBaseRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchTableSpaceRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchMultiXactRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchRelMapRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchStandbyRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchHeap2Record(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchHeapRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchSeqRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchGinRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchGistRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchSpgistRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchRepSlotRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchHeap3Record(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchDefaultRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchBarrierRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +#ifdef ENABLE_MOT +static bool DispatchMotRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +#endif +static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchSegpageSmgrRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchRepOriginRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); + +static bool DispatchUBTreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +static bool DispatchUBTree2Record(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +static bool RmgrRecordInfoValid(XLogReaderState *record, uint8 minInfo, uint8 maxInfo); +static bool RmgrGistRecordInfoValid(XLogReaderState *record, uint8 minInfo, uint8 maxInfo); + +/* Ustore table */ +static bool DispatchUHeapRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +static bool DispatchUHeap2Record(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +static bool DispatchUHeapUndoRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +static bool DispatchUndoActionRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +static bool DispatchRollbackFinishRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); +static inline uint32 GetUndoSpaceWorkerId(int zid); + +static XLogReaderState *GetXlogReader(XLogReaderState *readerState); +void CopyDataFromOldReader(XLogReaderState *newReaderState, const XLogReaderState *oldReaderState); +void SendSingalToPageWorker(int 
signal);
+
+/* dispatchTable must be consistent with RmgrTable */
+static const RmgrDispatchData g_dispatchTable[RM_MAX_ID + 1] = {
+    { DispatchXLogRecord, RmgrRecordInfoValid, RM_XLOG_ID, XLOG_CHECKPOINT_SHUTDOWN, XLOG_DELAY_XLOG_RECYCLE },
+    { DispatchXactRecord, RmgrRecordInfoValid, RM_XACT_ID, XLOG_XACT_COMMIT, XLOG_XACT_ABORT_WITH_XID },
+    { DispatchSmgrRecord, RmgrRecordInfoValid, RM_SMGR_ID, XLOG_SMGR_CREATE, XLOG_SMGR_TRUNCATE },
+    { DispatchCLogRecord, RmgrRecordInfoValid, RM_CLOG_ID, CLOG_ZEROPAGE, CLOG_TRUNCATE },
+    { DispatchDataBaseRecord, RmgrRecordInfoValid, RM_DBASE_ID, XLOG_DBASE_CREATE, XLOG_DBASE_DROP },
+    { DispatchTableSpaceRecord, RmgrRecordInfoValid, RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, XLOG_TBLSPC_RELATIVE_CREATE },
+    { DispatchMultiXactRecord,
+      RmgrRecordInfoValid,
+      RM_MULTIXACT_ID,
+      XLOG_MULTIXACT_ZERO_OFF_PAGE,
+      XLOG_MULTIXACT_CREATE_ID },
+    { DispatchRelMapRecord, RmgrRecordInfoValid, RM_RELMAP_ID, XLOG_RELMAP_UPDATE, XLOG_RELMAP_UPDATE },
+    { DispatchStandbyRecord, RmgrRecordInfoValid, RM_STANDBY_ID, XLOG_STANDBY_LOCK, XLOG_STANDBY_CSN_ABORTED },
+
+    { DispatchHeap2Record, RmgrRecordInfoValid, RM_HEAP2_ID, XLOG_HEAP2_FREEZE, XLOG_HEAP2_LOGICAL_NEWPAGE },
+    { DispatchHeapRecord, RmgrRecordInfoValid, RM_HEAP_ID, XLOG_HEAP_INSERT, XLOG_HEAP_INPLACE },
+    { DispatchBtreeRecord, RmgrRecordInfoValid, RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF, XLOG_BTREE_REUSE_PAGE },
+    { DispatchHashRecord, RmgrRecordInfoValid, RM_HASH_ID, XLOG_HASH_INIT_META_PAGE, XLOG_HASH_VACUUM_ONE_PAGE },
+    { DispatchGinRecord, RmgrRecordInfoValid, RM_GIN_ID, XLOG_GIN_CREATE_INDEX, XLOG_GIN_VACUUM_DATA_LEAF_PAGE },
+    /* XLOG_GIST_PAGE_DELETE is not used and info isn't continuous */
+    { DispatchGistRecord, RmgrGistRecordInfoValid, RM_GIST_ID, 0, 0 },
+    { DispatchSeqRecord, RmgrRecordInfoValid, RM_SEQ_ID, XLOG_SEQ_LOG, XLOG_SEQ_LOG },
+    { DispatchSpgistRecord, RmgrRecordInfoValid, RM_SPGIST_ID, XLOG_SPGIST_CREATE_INDEX, XLOG_SPGIST_VACUUM_REDIRECT },
+    { DispatchRepSlotRecord, RmgrRecordInfoValid, RM_SLOT_ID, XLOG_SLOT_CREATE, XLOG_TERM_LOG },
+    { DispatchHeap3Record, RmgrRecordInfoValid, RM_HEAP3_ID, XLOG_HEAP3_NEW_CID, XLOG_HEAP3_INVALID },
+    { DispatchBarrierRecord, RmgrRecordInfoValid, RM_BARRIER_ID, XLOG_BARRIER_CREATE, XLOG_BARRIER_SWITCHOVER },
+#ifdef ENABLE_MOT
+    { DispatchMotRecord, NULL, RM_MOT_ID, 0, 0 },
+#endif
+    { DispatchUHeapRecord, RmgrRecordInfoValid, RM_UHEAP_ID, XLOG_UHEAP_INSERT, XLOG_UHEAP_MULTI_INSERT },
+    { DispatchUHeap2Record, RmgrRecordInfoValid, RM_UHEAP2_ID, XLOG_UHEAP2_BASE_SHIFT, XLOG_UHEAP2_EXTEND_TD_SLOTS },
+    { DispatchUHeapUndoRecord, RmgrRecordInfoValid, RM_UNDOLOG_ID, XLOG_UNDO_EXTEND, XLOG_UNDO_DISCARD },
+    { DispatchUndoActionRecord, RmgrRecordInfoValid, RM_UHEAPUNDO_ID,
+      XLOG_UHEAPUNDO_PAGE, XLOG_UHEAPUNDO_ABORT_SPECINSERT },
+    { DispatchRollbackFinishRecord, RmgrRecordInfoValid, RM_UNDOACTION_ID, XLOG_ROLLBACK_FINISH, XLOG_ROLLBACK_FINISH },
+    { DispatchUBTreeRecord, RmgrRecordInfoValid, RM_UBTREE_ID, XLOG_UBTREE_INSERT_LEAF, XLOG_UBTREE_PRUNE_PAGE },
+    { DispatchUBTree2Record, RmgrRecordInfoValid, RM_UBTREE2_ID, XLOG_UBTREE2_SHIFT_BASE,
+      XLOG_UBTREE2_FREEZE },
+    { DispatchSegpageSmgrRecord, RmgrRecordInfoValid, RM_SEGPAGE_ID, XLOG_SEG_ATOMIC_OPERATION,
+      XLOG_SEG_NEW_PAGE },
+    { DispatchRepOriginRecord, RmgrRecordInfoValid, RM_REPLORIGIN_ID, XLOG_REPLORIGIN_SET, XLOG_REPLORIGIN_DROP },
+    { DispatchCompresseShrinkRecord, RmgrRecordInfoValid, RM_COMPRESSION_REL_ID, XLOG_CFS_SHRINK_OPERATION,
+      XLOG_CFS_SHRINK_OPERATION },
+};
+
+const int REDO_WAIT_SLEEP_TIME = 5000;
/* 5ms */ +const int MAX_REDO_WAIT_LOOP = 24000; /* 5ms*24000 = 2min */ + +uint32 GetReadyWorker() +{ + uint32 readyWorkerCnt = 0; + + for (uint32 i = 0; i < g_instance.comm_cxt.predo_cxt.totalNum; i++) { + uint32 state = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadState)); + if (state >= PAGE_REDO_WORKER_READY) { + ++readyWorkerCnt; + } + } + return readyWorkerCnt; +} + +void WaitWorkerReady() +{ + uint32 waitLoop = 0; + uint32 readyWorkerCnt = 0; + /* MAX wait 2min */ + for (waitLoop = 0; waitLoop < MAX_REDO_WAIT_LOOP; ++waitLoop) { + readyWorkerCnt = GetReadyWorker(); + if (readyWorkerCnt == g_instance.comm_cxt.predo_cxt.totalNum) { + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("WaitWorkerReady total worker count:%u, readyWorkerCnt:%u", + g_dispatcher->allWorkersCnt, readyWorkerCnt))); + break; + } + pg_usleep(REDO_WAIT_SLEEP_TIME); + } + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock)); + g_instance.comm_cxt.predo_cxt.state = REDO_STARTING_END; + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock)); + readyWorkerCnt = GetReadyWorker(); + if (waitLoop == MAX_REDO_WAIT_LOOP && readyWorkerCnt == 0) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("WaitWorkerReady failed, no worker is ready for work. totalWorkerCount :%u", + g_dispatcher->allWorkersCnt))); + } + + /* RTO_DEMO */ + if (readyWorkerCnt != g_dispatcher->allWorkersCnt) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("WaitWorkerReady total thread count:%u, readyWorkerCnt:%u, not all thread ready", + g_dispatcher->allWorkersCnt, readyWorkerCnt))); + } +} + +void CheckAlivePageWorkers() +{ + for (uint32 i = 0; i < MAX_RECOVERY_THREAD_NUM; ++i) { + if (g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadState != PAGE_REDO_WORKER_INVALID) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("CheckAlivePageWorkers: thread %lu is still alive", + g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadId))); + } + g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadId = 0; + } + g_instance.comm_cxt.predo_cxt.totalNum = 0; +} + +#ifdef USE_ASSERT_CHECKING +void InitLsnCheckCtl(XLogRecPtr readRecPtr) +{ + g_dispatcher->originLsnCheckAddr = (void *)palloc0(sizeof(LsnCheckCtl) + ONDEMAND_EXTREME_RTO_ALIGN_LEN); + g_dispatcher->lsnCheckCtl = (LsnCheckCtl *)TYPEALIGN(ONDEMAND_EXTREME_RTO_ALIGN_LEN, g_dispatcher->originLsnCheckAddr); + g_dispatcher->lsnCheckCtl->curLsn = readRecPtr; + g_dispatcher->lsnCheckCtl->curPosition = 0; + SpinLockInit(&g_dispatcher->updateLck); +#if (!defined __x86_64__) && (!defined __aarch64__) + SpinLockInit(&g_dispatcher->lsnCheckCtl->ptrLck); +#endif +} +#endif + +void AllocRecordReadBuffer(XLogReaderState *xlogreader, uint32 privateLen) +{ + XLogReaderState *initreader; + errno_t errorno = EOK; + + initreader = GetXlogReader(xlogreader); + initreader->isPRProcess = true; + g_dispatcher->rtoXlogBufState.readWorkerState = WORKER_STATE_STOP; + g_dispatcher->rtoXlogBufState.readPageWorkerState = WORKER_STATE_STOP; + g_dispatcher->rtoXlogBufState.readSource = 0; + g_dispatcher->rtoXlogBufState.failSource = 0; + g_dispatcher->rtoXlogBufState.xlogReadManagerState = READ_MANAGER_RUN; + g_dispatcher->rtoXlogBufState.targetRecPtr = InvalidXLogRecPtr; + g_dispatcher->rtoXlogBufState.expectLsn = InvalidXLogRecPtr; + g_dispatcher->rtoXlogBufState.waitRedoDone = 0; + g_dispatcher->rtoXlogBufState.readsegbuf = (char *)palloc0(XLogSegSize * 
MAX_ALLOC_SEGNUM); + g_dispatcher->rtoXlogBufState.readBuf = (char *)palloc0(XLOG_BLCKSZ); + g_dispatcher->rtoXlogBufState.readprivate = (void *)palloc0(MAXALIGN(privateLen)); + errorno = memset_s(g_dispatcher->rtoXlogBufState.readprivate, MAXALIGN(privateLen), 0, MAXALIGN(privateLen)); + securec_check(errorno, "", ""); + + g_dispatcher->rtoXlogBufState.errormsg_buf = (char *)palloc0(MAX_ERRORMSG_LEN + 1); + g_dispatcher->rtoXlogBufState.errormsg_buf[0] = '\0'; + + char *readsegbuf = g_dispatcher->rtoXlogBufState.readsegbuf; + for (uint32 i = 0; i < MAX_ALLOC_SEGNUM; i++) { + g_dispatcher->rtoXlogBufState.xlogsegarray[i].readsegbuf = readsegbuf; + readsegbuf += XLogSegSize; + g_dispatcher->rtoXlogBufState.xlogsegarray[i].bufState = NONE; + } + + g_dispatcher->rtoXlogBufState.applyindex = 0; + + g_dispatcher->rtoXlogBufState.readindex = 0; + + g_dispatcher->rtoXlogBufState.xlogsegarray[0].segno = xlogreader->readSegNo; + g_dispatcher->rtoXlogBufState.xlogsegarray[0].segoffset = xlogreader->readOff; + g_dispatcher->rtoXlogBufState.xlogsegarray[0].readlen = xlogreader->readOff + xlogreader->readLen; + + initreader->readBuf = g_dispatcher->rtoXlogBufState.xlogsegarray[0].readsegbuf + + g_dispatcher->rtoXlogBufState.xlogsegarray[0].segoffset; + + errorno = memcpy_s(initreader->readBuf, XLOG_BLCKSZ, xlogreader->readBuf, xlogreader->readLen); + securec_check(errorno, "", ""); + initreader->errormsg_buf = g_dispatcher->rtoXlogBufState.errormsg_buf; + initreader->private_data = g_dispatcher->rtoXlogBufState.readprivate; + CopyDataFromOldReader(initreader, xlogreader); + g_dispatcher->rtoXlogBufState.initreader = initreader; + + g_recordbuffer = &g_dispatcher->rtoXlogBufState; + g_startupTriggerState = TRIGGER_NORMAL; + g_readManagerTriggerFlag = TRIGGER_NORMAL; +#ifdef USE_ASSERT_CHECKING + InitLsnCheckCtl(xlogreader->ReadRecPtr); +#endif +} + +void SSAllocRecordReadBuffer(XLogReaderState *xlogreader, uint32 privateLen) +{ + XLogReaderState *initreader; + errno_t errorno = EOK; + + initreader = GetXlogReader(xlogreader); + initreader->isPRProcess = true; + g_dispatcher->rtoXlogBufState.readWorkerState = WORKER_STATE_STOP; + g_dispatcher->rtoXlogBufState.readPageWorkerState = WORKER_STATE_STOP; + g_dispatcher->rtoXlogBufState.readSource = 0; + g_dispatcher->rtoXlogBufState.failSource = 0; + g_dispatcher->rtoXlogBufState.xlogReadManagerState = READ_MANAGER_RUN; + g_dispatcher->rtoXlogBufState.targetRecPtr = InvalidXLogRecPtr; + g_dispatcher->rtoXlogBufState.expectLsn = InvalidXLogRecPtr; + g_dispatcher->rtoXlogBufState.waitRedoDone = 0; + g_dispatcher->rtoXlogBufState.readBuf = (char *)palloc0(XLOG_BLCKSZ); + g_dispatcher->rtoXlogBufState.readprivate = (void *)palloc0(MAXALIGN(privateLen)); + errorno = memset_s(g_dispatcher->rtoXlogBufState.readprivate, MAXALIGN(privateLen), 0, MAXALIGN(privateLen)); + securec_check(errorno, "", ""); + + g_dispatcher->rtoXlogBufState.errormsg_buf = (char *)palloc0(MAX_ERRORMSG_LEN + 1); + g_dispatcher->rtoXlogBufState.errormsg_buf[0] = '\0'; + + initreader->readBuf = g_dispatcher->rtoXlogBufState.readBuf; + errorno = memcpy_s(initreader->readBuf, XLOG_BLCKSZ, xlogreader->readBuf, xlogreader->readLen); + securec_check(errorno, "", ""); + initreader->errormsg_buf = g_dispatcher->rtoXlogBufState.errormsg_buf; + initreader->private_data = g_dispatcher->rtoXlogBufState.readprivate; + CopyDataFromOldReader(initreader, xlogreader); + g_dispatcher->rtoXlogBufState.initreader = initreader; + + g_recordbuffer = &g_dispatcher->rtoXlogBufState; + g_startupTriggerState = 
TRIGGER_NORMAL;
+    g_readManagerTriggerFlag = TRIGGER_NORMAL;
+#ifdef USE_ASSERT_CHECKING
+    InitLsnCheckCtl(xlogreader->ReadRecPtr);
+#endif
+}
+
+void HandleStartupInterruptsForExtremeRto()
+{
+    Assert(AmStartupProcess());
+
+    uint32 newtriggered = (uint32)CheckForSatartupStatus();
+    if (newtriggered != TRIGGER_NORMAL) {
+        uint32 triggeredstate = pg_atomic_read_u32(&(g_startupTriggerState));
+        if (triggeredstate != newtriggered) {
+            ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                errmsg("HandleStartupInterruptsForExtremeRto:g_startupTriggerState set from %u to %u",
+                    triggeredstate, newtriggered)));
+            pg_atomic_write_u32(&(g_startupTriggerState), newtriggered);
+        }
+    }
+
+    if (t_thrd.startup_cxt.got_SIGHUP) {
+        t_thrd.startup_cxt.got_SIGHUP = false;
+        SendSingalToPageWorker(SIGHUP);
+        ProcessConfigFile(PGC_SIGHUP);
+    }
+
+    if (t_thrd.startup_cxt.shutdown_requested) {
+        if (g_instance.status != SmartShutdown) {
+            proc_exit(1);
+        } else {
+            g_dispatcher->smartShutdown = true;
+        }
+    }
+    if (t_thrd.startup_cxt.check_repair) {
+        t_thrd.startup_cxt.check_repair = false;
+    }
+}
+
+/* Run from the dispatcher thread. */
+void StartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen)
+{
+    if (get_real_recovery_parallelism() > 1) {
+        if (t_thrd.xlog_cxt.StandbyModeRequested) {
+            ReLeaseRecoveryLatch();
+        }
+
+        CheckAlivePageWorkers();
+        g_dispatcher = CreateDispatcher();
+        g_dispatcher->oldCtx = MemoryContextSwitchTo(g_instance.comm_cxt.predo_cxt.parallelRedoCtx);
+        g_instance.comm_cxt.redoItemCtx = AllocSetContextCreate((MemoryContext)g_instance.instance_context,
+            "redoItemSharedMemory",
+            ALLOCSET_DEFAULT_MINSIZE,
+            ALLOCSET_DEFAULT_INITSIZE,
+            ALLOCSET_DEFAULT_MAXSIZE,
+            SHARED_CONTEXT);
+        g_instance.comm_cxt.predo_cxt.redoItemHash = PRRedoItemHashInitialize(g_instance.comm_cxt.redoItemCtx);
+        uint32 maxParseBufNum = (uint32)((uint64)g_instance.attr.attr_storage.dms_attr.ondemand_recovery_mem_size *
+            1024 / (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc) + sizeof(RedoMemSlot)));
+        g_dispatcher->maxItemNum = 4 * PAGE_WORK_QUEUE_SIZE * ITEM_QUQUE_SIZE_RATIO + maxParseBufNum;
+        XLogParseBufferInitFunc(&(g_dispatcher->parseManager), maxParseBufNum, &recordRefOperate, RedoInterruptCallBack);
+        /* alloc for record readbuf */
+        SSAllocRecordReadBuffer(xlogreader, privateLen);
+        StartPageRedoWorkers(get_real_recovery_parallelism());
+
+        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("[PR]: max=%d, thrd=%d", g_instance.attr.attr_storage.max_recovery_parallelism,
+                get_real_recovery_parallelism())));
+        WaitWorkerReady();
+        SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock));
+        g_instance.comm_cxt.predo_cxt.state = REDO_IN_PROGRESS;
+        SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock));
+        on_shmem_exit(StopRecoveryWorkers, 0);
+
+        g_dispatcher->oldStartupIntrruptFunc = RegisterRedoInterruptCallBack(HandleStartupInterruptsForExtremeRto);
+
+        close_readFile_if_open();
+    }
+}
+
+void DumpDispatcher()
+{
+    knl_parallel_redo_state state;
+    PageRedoPipeline *pl = NULL;
+    state = g_instance.comm_cxt.predo_cxt.state;
+    if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) {
+        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("[REDO_LOG_TRACE]dispatcher : totalWorkerCount %d, state %u, curItemNum %u, maxItemNum %u",
+                get_real_recovery_parallelism(), (uint32)state, g_dispatcher->curItemNum,
+                g_dispatcher->maxItemNum)));
+
+    
for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) { + pl = &(g_dispatcher->pageLines[i]); + DumpPageRedoWorker(pl->batchThd); + DumpPageRedoWorker(pl->managerThd); + for (uint32 j = 0; j < pl->redoThdNum; j++) { + DumpPageRedoWorker(pl->redoThd[j]); + } + } + DumpPageRedoWorker(g_dispatcher->trxnLine.managerThd); + DumpPageRedoWorker(g_dispatcher->trxnLine.redoThd); + DumpXlogCtl(); + } +} + +/* Run from the dispatcher thread. */ +STATIC LogDispatcher *CreateDispatcher() +{ + MemoryContext ctx = AllocSetContextCreate(g_instance.instance_context, "ParallelRecoveryDispatcher", + ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE, SHARED_CONTEXT); + + LogDispatcher *newDispatcher = (LogDispatcher *)MemoryContextAllocZero(ctx, sizeof(LogDispatcher)); + + g_instance.comm_cxt.predo_cxt.parallelRedoCtx = ctx; + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock)); + g_instance.comm_cxt.predo_cxt.state = REDO_STARTING_BEGIN; + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock)); + newDispatcher->totalCostTime = 0; + newDispatcher->txnCostTime = 0; + newDispatcher->pprCostTime = 0; + newDispatcher->syncEnterCount = 0; + newDispatcher->syncExitCount = 0; + newDispatcher->batchThrdEnterNum = 0; + newDispatcher->batchThrdExitNum = 0; + newDispatcher->segpageXactDoneFlag = 0; + + pg_atomic_init_u32(&(newDispatcher->standbyState), STANDBY_INITIALIZED); + newDispatcher->needImmediateCheckpoint = false; + newDispatcher->needFullSyncCheckpoint = false; + newDispatcher->smartShutdown = false; + newDispatcher->startupTimeCost = t_thrd.xlog_cxt.timeCost; + return newDispatcher; +} + +void RedoRoleInit(PageRedoWorker **dstWk, PageRedoWorker *srcWk, RedoRole role, + uint32 slotId, bool isUndoSpaceWorker) +{ + *dstWk = srcWk; + (*dstWk)->role = role; + (*dstWk)->slotId = slotId; + (*dstWk)->isUndoSpaceWorker = isUndoSpaceWorker; +} + +/* Run from the dispatcher thread. 
*/ +static void StartPageRedoWorkers(uint32 totalThrdNum) +{ + uint32 batchNum = get_batch_redo_num(); + uint32 batchWorkerPerMng = get_page_redo_worker_num_per_manager(); + uint32 undoSpaceWorkersNum = get_recovery_undozidworkers_num(); + uint32 workerCnt = 0; + PageRedoWorker **tmpWorkers; + uint32 started; + ereport(LOG, (errmsg("StartPageRedoWorkers, totalThrdNum:%u, " + "batchNum:%u, batchWorkerPerMng is %u", + totalThrdNum, batchNum, batchWorkerPerMng))); + + g_dispatcher->allWorkers = (PageRedoWorker **)palloc0(sizeof(PageRedoWorker *) * totalThrdNum); + g_dispatcher->allWorkersCnt = totalThrdNum; + g_dispatcher->pageLines = (PageRedoPipeline *)palloc(sizeof(PageRedoPipeline) * batchNum); + + for (started = 0; started < totalThrdNum; started++) { + g_dispatcher->allWorkers[started] = CreateWorker(started); + if (g_dispatcher->allWorkers[started] == NULL) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[REDO_LOG_TRACE]StartPageRedoWorkers CreateWorker failed, started:%u", started))); + } + } + tmpWorkers = g_dispatcher->allWorkers; + for (uint32 i = 0; i < batchNum; i++) { + bool isUndoSpaceWorker = false; + if (i >= (batchNum - undoSpaceWorkersNum)) { + isUndoSpaceWorker = true; + } + RedoRoleInit(&(g_dispatcher->pageLines[i].batchThd), tmpWorkers[workerCnt++], REDO_BATCH, i, isUndoSpaceWorker); + RedoRoleInit(&(g_dispatcher->pageLines[i].managerThd), tmpWorkers[workerCnt++], REDO_PAGE_MNG, i, + isUndoSpaceWorker); + g_dispatcher->pageLines[i].redoThd = (PageRedoWorker **)palloc(sizeof(PageRedoWorker *) * batchWorkerPerMng); + g_dispatcher->pageLines[i].chosedRTIds = (uint32 *)palloc(sizeof(uint32) * batchWorkerPerMng); + g_dispatcher->pageLines[i].chosedRTCnt = 0; + for (uint32 j = 0; j < batchWorkerPerMng; j++) { + RedoRoleInit(&(g_dispatcher->pageLines[i].redoThd[j]), tmpWorkers[workerCnt++], REDO_PAGE_WORKER, j, + isUndoSpaceWorker); + } + g_dispatcher->pageLines[i].redoThdNum = batchWorkerPerMng; + } + + RedoRoleInit(&(g_dispatcher->trxnLine.managerThd), tmpWorkers[workerCnt++], REDO_TRXN_MNG, 0, false); + RedoRoleInit(&(g_dispatcher->trxnLine.redoThd), tmpWorkers[workerCnt++], REDO_TRXN_WORKER, 0, false); + + RedoRoleInit(&(g_dispatcher->readLine.managerThd), tmpWorkers[workerCnt++], REDO_READ_MNG, 0, false); + RedoRoleInit(&(g_dispatcher->readLine.readPageThd), tmpWorkers[workerCnt++], REDO_READ_PAGE_WORKER, 0, false); + RedoRoleInit(&(g_dispatcher->readLine.readThd), tmpWorkers[workerCnt++], REDO_READ_WORKER, 0, false); + + for (started = 0; started < totalThrdNum; started++) { + if (StartPageRedoWorker(g_dispatcher->allWorkers[started]) == NULL) { + ereport(PANIC, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[REDO_LOG_TRACE]StartPageRedoWorkers StartPageRedoWorker failed, started:%u", started))); + } + } + + Assert(totalThrdNum == workerCnt); + g_dispatcher->pageLineNum = batchNum; + g_instance.comm_cxt.predo_cxt.totalNum = workerCnt; + g_dispatcher->chosedPageLineIds = (uint32 *)palloc(sizeof(uint32) * batchNum); + g_dispatcher->chosedPLCnt = 0; +} + +static void ResetChosedPageLineList() +{ + g_dispatcher->chosedPLCnt = 0; + + for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) { + g_dispatcher->chosedPageLineIds[i] = 0; + } +} + +bool DispathCouldExit() +{ + for (uint32 i = 0; i < g_instance.comm_cxt.predo_cxt.totalNum; ++i) { + uint32 state = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadState)); + if (state == PAGE_REDO_WORKER_READY) { + return false; + } + } + + return true; +} + +void 
SetPageWorkStateByThreadId(uint32 threadState) +{ + gs_thread_t curThread = gs_thread_get_cur_thread(); + for (uint32 i = 0; i < g_instance.comm_cxt.predo_cxt.totalNum; ++i) { + if (g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadId == curThread.thid) { + pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadState), threadState); + break; + } + } +} + +void SendSingalToPageWorker(int signal) +{ + for (uint32 i = 0; i < g_instance.comm_cxt.predo_cxt.totalNum; ++i) { + uint32 state = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadState)); + if (state == PAGE_REDO_WORKER_READY) { + int err = gs_signal_send(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadId, signal); + if (0 != err) { + ereport(WARNING, (errmsg("Dispatch kill(pid %lu, signal %d) failed: \"%s\",", + g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[i].threadId, signal, + gs_strerror(err)))); + } + } + } +} + +/* Run from the dispatcher thread. */ +static void StopRecoveryWorkers(int code, Datum arg) +{ + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("parallel redo workers are going to stop, code:%d, arg:%lu", + code, DatumGetUInt64(arg)))); + SendSingalToPageWorker(SIGTERM); + + uint64 count = 0; + while (!DispathCouldExit()) { + ++count; + if ((count & OUTPUT_WAIT_COUNT) == OUTPUT_WAIT_COUNT) { + ereport(WARNING, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("StopRecoveryWorkers wait page work exit"))); + if ((count & PRINT_ALL_WAIT_COUNT) == PRINT_ALL_WAIT_COUNT) { + DumpDispatcher(); + ereport(PANIC, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("StopRecoveryWorkers wait too long!!!"))); + } + pg_usleep(EXIT_WAIT_DELAY); + } + } + + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readWorkerState, WORKER_STATE_EXIT); + ShutdownWalRcv(); + FreeAllocatedRedoItem(); + SSDestroyRecoveryWorkers(); + g_startupTriggerState = TRIGGER_NORMAL; + g_readManagerTriggerFlag = TRIGGER_NORMAL; + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("parallel redo(startup) thread exit"))); +} + +/* Run from the dispatcher thread. 
*/ +static void SSDestroyRecoveryWorkers() +{ + if (g_dispatcher != NULL) { + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + DestroyPageRedoWorker(g_dispatcher->pageLines[i].batchThd); + DestroyPageRedoWorker(g_dispatcher->pageLines[i].managerThd); + for (uint32 j = 0; j < g_dispatcher->pageLines[i].redoThdNum; j++) { + DestroyPageRedoWorker(g_dispatcher->pageLines[i].redoThd[j]); + } + if (g_dispatcher->pageLines[i].chosedRTIds != NULL) { + pfree(g_dispatcher->pageLines[i].chosedRTIds); + } + } + DestroyPageRedoWorker(g_dispatcher->trxnLine.managerThd); + DestroyPageRedoWorker(g_dispatcher->trxnLine.redoThd); + + DestroyPageRedoWorker(g_dispatcher->readLine.managerThd); + DestroyPageRedoWorker(g_dispatcher->readLine.readThd); + pfree(g_dispatcher->rtoXlogBufState.readBuf); + pfree(g_dispatcher->rtoXlogBufState.errormsg_buf); + pfree(g_dispatcher->rtoXlogBufState.readprivate); +#ifdef USE_ASSERT_CHECKING + if (g_dispatcher->originLsnCheckAddr != NULL) { + pfree(g_dispatcher->originLsnCheckAddr); + g_dispatcher->originLsnCheckAddr = NULL; + g_dispatcher->lsnCheckCtl = NULL; + } +#endif + if (get_real_recovery_parallelism() > 1) { + (void)MemoryContextSwitchTo(g_dispatcher->oldCtx); + MemoryContextDelete(g_instance.comm_cxt.predo_cxt.parallelRedoCtx); + g_instance.comm_cxt.predo_cxt.parallelRedoCtx = NULL; + } + g_dispatcher = NULL; + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + } +} + +static bool RmgrRecordInfoValid(XLogReaderState *record, uint8 minInfo, uint8 maxInfo) +{ + uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); + + switch (XLogRecGetRmid(record)) { + case RM_HEAP2_ID: + case RM_HEAP_ID: { + info = (info & XLOG_HEAP_OPMASK); + break; + } + case RM_MULTIXACT_ID: { + info = (info & XLOG_MULTIXACT_MASK); + break; + } + case RM_UHEAP_ID: + case RM_UNDOLOG_ID: + case RM_UHEAPUNDO_ID: + case RM_UNDOACTION_ID: { + info = (info & XLOG_UHEAP_OPMASK); + break; + } + default: + break; + } + + info = (info >> XLOG_INFO_SHIFT_SIZE); + minInfo = (minInfo >> XLOG_INFO_SHIFT_SIZE); + maxInfo = (maxInfo >> XLOG_INFO_SHIFT_SIZE); + + if ((info >= minInfo) && (info <= maxInfo)) { + return true; + } + return false; +} + +static bool RmgrGistRecordInfoValid(XLogReaderState *record, uint8 minInfo, uint8 maxInfo) +{ + uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); + + if ((info == XLOG_GIST_PAGE_UPDATE) || (info == XLOG_GIST_PAGE_SPLIT) || (info == XLOG_GIST_CREATE_INDEX)) { + return true; + } + + return false; +} + +/* Run from the dispatcher thread. 
*/
+void DispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    bool fatalerror = false;
+    uint32 indexid = RM_NEXT_ID;
+
+    Assert(record != NULL);
+
+    uint32 rmid = XLogRecGetRmid(record);
+    uint32 term = XLogRecGetTerm(record);
+    if (term > g_instance.comm_cxt.localinfo_cxt.term_from_xlog) {
+        g_instance.comm_cxt.localinfo_cxt.term_from_xlog = term;
+    }
+    t_thrd.xlog_cxt.redoItemIdx = 0;
+    if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) {
+        if (rmid <= RM_MAX_ID) {
+            indexid = g_dispatchTable[rmid].rm_id;
+            if ((indexid != rmid) ||
+                ((g_dispatchTable[rmid].rm_loginfovalid != NULL) &&
+                 (g_dispatchTable[rmid].rm_loginfovalid(record, g_dispatchTable[rmid].rm_mininfo,
+                                                        g_dispatchTable[rmid].rm_maxinfo) == false))) {
+                /* it's invalid info */
+                fatalerror = true;
+            }
+        } else {
+            fatalerror = true;
+        }
+        ResetChosedPageLineList();
+        if (fatalerror != true) {
+#ifdef ENABLE_UT
+            TestXLogReaderProbe(UTEST_EVENT_RTO_DISPATCH_REDO_RECORD_TO_FILE, __FUNCTION__, record);
+#endif
+            g_dispatchTable[rmid].rm_dispatch(record, expectedTLIs, recordXTime);
+        } else {
+            DispatchDefaultRecord(record, expectedTLIs, recordXTime);
+            DumpDispatcher();
+            DumpItem(GetRedoItemPtr(record), "DispatchRedoRecordToFile");
+            ereport(PANIC,
+                (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                 errmsg("[REDO_LOG_TRACE]DispatchRedoRecord encounter fatal error:rmgrID:%u, info:%u, indexid:%u",
+                        rmid, (uint32)XLogRecGetInfo(record), indexid)));
+        }
+    } else {
+        ereport(PANIC,
+            (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+             errmsg("[REDO_LOG_TRACE]DispatchRedoRecord could not be here config recovery num %d, work num %u",
+                    get_real_recovery_parallelism(), GetBatchCount())));
+    }
+}
+
+/**
+ * Process a record that must be synchronized between the page workers and
+ * the trxn thread: the record is queued to every chosen page line and then
+ * to the trxn manager, which waits for the page side before executing.
+ */
+static void DispatchSyncTxnRecord(XLogReaderState *record, List *expectedTLIs)
+{
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+
+    if ((g_dispatcher->chosedPLCnt != 1) && (XLogRecGetRmid(&item->record) != RM_XACT_ID)) {
+        ereport(WARNING,
+            (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+             errmsg("[REDO_LOG_TRACE]DispatchSyncTxnRecord maybe some error:rmgrID:%u, info:%u, workerCount:%u",
+                    XLogRecGetRmid(&item->record), XLogRecGetInfo(&item->record), g_dispatcher->chosedPLCnt)));
+    }
+
+    for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) {
+        if (g_dispatcher->chosedPageLineIds[i] > 0) {
+            ReferenceRedoItem(item);
+            AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item);
+        }
+    }
+
+    /* ensure every page worker receives the record so it can update its LSN;
+     * the trxn record's recordtime must be set, see SetLatestXTime
+     */
+    AddTxnRedoItem(g_dispatcher->trxnLine.managerThd, item);
+    return;
+}
+
+static void DispatchToOnePageWorker(XLogReaderState *record, const RelFileNode rnode, List *expectedTLIs)
+{
+    /* for BCM, different attrs need to dispatch to the same page redo thread */
+    uint32 slotId = GetSlotId(rnode, 0, 0, GetBatchCount());
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+    AddPageRedoItem(g_dispatcher->pageLines[slotId].batchThd, item);
+}
+
+static void DispatchToSpecificOnePageWorker(XLogReaderState *record, uint32 slotId, List *expectedTLIs)
+{
+    Assert(slotId <= GetBatchCount());
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+    AddPageRedoItem(g_dispatcher->pageLines[slotId].batchThd, item);
+}
+
+/**
+ * The transaction worker waits until every page worker has replayed
+ * all records before this one. We dispatch an LSN marker to every page
+ * worker so they can update their progress.
+ *
+ * We need to dispatch to page workers first, because the transaction
+ * worker runs in the dispatcher thread and may block waiting on page
+ * workers. The trxn record's recordtime must be set, see SetLatestXTime.
+ */
+static void DispatchTxnRecord(XLogReaderState *record, List *expectedTLIs)
+{
+    RedoItem *trxnItem = GetRedoItemPtr(record);
+    ReferenceRedoItem(trxnItem);
+    AddTxnRedoItem(g_dispatcher->trxnLine.managerThd, trxnItem);
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchBarrierRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    RedoItem *item = GetRedoItemPtr(record);
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+    ReferenceRedoItem(item);
+    if (info != XLOG_BARRIER_COMMIT) {
+        item->record.isFullSync = true;
+    }
+    for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) {
+        ReferenceRedoItem(item);
+        AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item);
+    }
+
+    AddTxnRedoItem(g_dispatcher->trxnLine.managerThd, item);
+    return false;
+}
+
+#ifdef ENABLE_MOT
+static bool DispatchMotRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchTxnRecord(record, expectedTLIs);
+    return false;
+}
+#endif
+
+/* Run from the dispatcher thread. */
+static bool DispatchRepSlotRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchTxnRecord(record, expectedTLIs);
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchHeap3Record(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+
+    if (info == XLOG_HEAP3_INVALID) {
+        DispatchRecordWithPages(record, expectedTLIs);
+    } else {
+        DispatchTxnRecord(record, expectedTLIs);
+    }
+    return false;
+}
+
+/* records with an invalid rmid or info enter this function so that every worker runs to the same position */
+static bool DispatchDefaultRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchTxnRecord(record, expectedTLIs);
+    return true;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchXLogRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    bool isNeedFullSync = false;
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+
+    if (IsCheckPoint(record)) {
+        return isNeedFullSync;
+    } else if ((info == XLOG_FPI) || (info == XLOG_FPI_FOR_HINT)) {
+        DispatchRecordWithPages(record, expectedTLIs);
+    } else {
+        /* processed in the trxn thread; needs to sync with the page redo threads */
+        DispatchTxnRecord(record, expectedTLIs);
+    }
+
+    return isNeedFullSync;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchRelMapRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    /* page redo workers use the relnode directly and never read the relmap file */
+    DispatchTxnRecord(record, expectedTLIs);
+    return false;
+}
+
+/* Run from the dispatcher thread. 
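 * DispatchXactRecord (below): commit/abort records that will remove
 * relation files must be observed by every page line, so all lines are
 * added to the chosen set before the synchronized dispatch. For
 * segment-page relation files the dispatcher additionally raises
 * segpageXactDoneFlag and spins until it is cleared again, guaranteeing
 * the file drop has been applied before the next record is dispatched.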
*/
+static bool DispatchXactRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    if (XactWillRemoveRelFiles(record)) {
+        bool hasSegpageRelFile = XactHasSegpageRelFiles(record);
+        uint32 doneFlag = 0;
+
+        for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) {
+            AddSlotToPLSet(i);
+        }
+
+        if (hasSegpageRelFile) {
+            doneFlag = 0;
+            pg_atomic_compare_exchange_u32((volatile uint32 *)&g_dispatcher->segpageXactDoneFlag, &doneFlag, 1);
+        }
+
+        /* sync with the trxn thread: the trxn thread executes the drop action
+         * while the page workers forget the invalid pages; the page workers
+         * execute first and update their last completed LSN, and only then
+         * does the trxn thread execute */
+        DispatchSyncTxnRecord(record, expectedTLIs);
+
+        if (hasSegpageRelFile) {
+            doneFlag = pg_atomic_read_u32(&g_dispatcher->segpageXactDoneFlag);
+            while (doneFlag != 0) {
+                RedoInterruptCallBack();
+                doneFlag = pg_atomic_read_u32(&g_dispatcher->segpageXactDoneFlag);
+            }
+        }
+    } else {
+        /* processed in the trxn thread; needs to sync with the page redo threads */
+        DispatchTxnRecord(record, expectedTLIs);
+    }
+
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchStandbyRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    /* changes standbystate, must be full sync, see UpdateStandbyState */
+    bool isNeedFullSync = StandbyWillChangeStandbyState(record);
+
+    DispatchTxnRecord(record, expectedTLIs);
+
+    return isNeedFullSync;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchMultiXactRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    /* page workers will not use multixact */
+    DispatchTxnRecord(record, expectedTLIs);
+
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static void DispatchRecordWithoutPage(XLogReaderState *record, List *expectedTLIs)
+{
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+    for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) {
+        ReferenceRedoItem(item);
+        AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item);
+    }
+    DereferenceRedoItem(item);
+}
+
+/* Run from the dispatcher thread. 
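 * DispatchRecordWithPages (below) is the common path for block-touching
 * records: GetSlotIds hashes each referenced block's relfilenode into the
 * chosen-line set, and the item is queued once per chosen line before the
 * dispatcher drops its own reference.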
*/ +static void DispatchRecordWithPages(XLogReaderState *record, List *expectedTLIs) +{ + GetSlotIds(record); + + RedoItem *item = GetRedoItemPtr(record); + ReferenceRedoItem(item); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + if (g_dispatcher->chosedPageLineIds[i] > 0) { + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + } + } + DereferenceRedoItem(item); +} + +static bool DispatchHeapRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) +{ + if (record->max_block_id >= 0) + DispatchRecordWithPages(record, expectedTLIs); + else + DispatchRecordWithoutPage(record, expectedTLIs); + + return false; +} + +static bool DispatchSeqRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) +{ + DispatchRecordWithPages(record, expectedTLIs); + + return false; +} + +static bool DispatchDataBaseRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) +{ + bool isNeedFullSync = false; + + if (IsDataBaseDrop(record)) { + isNeedFullSync = true; + RedoItem *item = GetRedoItemPtr(record); + + ReferenceRedoItem(item); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + } + DereferenceRedoItem(item); + } else { + /* database dir may impact many rel so need to sync to all pageworks */ + DispatchRecordWithoutPage(record, expectedTLIs); + g_dispatcher->needFullSyncCheckpoint = true; + } + + g_dispatcher->needImmediateCheckpoint = true; + return isNeedFullSync; +} + +static bool DispatchTableSpaceRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) +{ + RedoItem *item = GetRedoItemPtr(record); + uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); + if (info == XLOG_TBLSPC_CREATE || info == XLOG_TBLSPC_RELATIVE_CREATE) { + item->record.isFullSync = true; + } + ReferenceRedoItem(item); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) { + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + } + AddTxnRedoItem(g_dispatcher->trxnLine.managerThd, item); + + g_dispatcher->needImmediateCheckpoint = true; + return false; +} + +static bool DispatchSmgrRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime) +{ + bool isNeedFullSync = false; + uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK)); + + if (info == XLOG_SMGR_CREATE) { + /* only need to dispatch to one page worker */ + xl_smgr_create *xlrec = (xl_smgr_create *)XLogRecGetData(record); + RelFileNode rnode; + RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record)); + rnode.opt = GetCreateXlogFileNodeOpt(record); + DispatchToOnePageWorker(record, rnode, expectedTLIs); + } else if (IsSmgrTruncate(record)) { + xl_smgr_truncate *xlrec = (xl_smgr_truncate *)XLogRecGetData(record); + RelFileNode rnode; + RelFileNodeCopy(rnode, xlrec->rnode, XLogRecGetBucketId(record)); + rnode.opt = GetTruncateXlogFileNodeOpt(record); + uint32 id = GetSlotId(rnode, 0, 0, GetBatchCount()); + AddSlotToPLSet(id); + + DispatchToSpecPageWorker(record, expectedTLIs); + } + + return isNeedFullSync; +} + +static void DispatchRecordBySegHeadBuffer(XLogReaderState* record, List* expectedTLIs, uint32 segHeadBlockId) +{ + RelFileNode rnode; + BlockNumber blknum; + XLogRecGetBlockTag(record, segHeadBlockId, &rnode, NULL, &blknum); + rnode.relNode = blknum; + + DispatchToOnePageWorker(record, rnode, expectedTLIs); +} + +static bool DispatchSegpageSmgrRecord(XLogReaderState *record, 
List *expectedTLIs, TimestampTz recordXTime)
+{
+    bool isNeedFullSync = false;
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+
+    switch (info) {
+        case XLOG_SEG_ATOMIC_OPERATION:
+        case XLOG_SEG_SEGMENT_EXTEND:
+        case XLOG_SEG_CREATE_EXTENT_GROUP:
+        case XLOG_SEG_INIT_MAPPAGE:
+        case XLOG_SEG_INIT_INVRSPTR_PAGE:
+        case XLOG_SEG_ADD_NEW_GROUP:
+        case XLOG_SEG_SPACE_SHRINK:
+        case XLOG_SEG_SPACE_DROP:
+        case XLOG_SEG_NEW_PAGE:
+            DispatchToSpecificOnePageWorker(record, 0, expectedTLIs);
+            break;
+        case XLOG_SEG_TRUNCATE:
+            DispatchRecordBySegHeadBuffer(record, expectedTLIs, 0);
+            break;
+        default:
+            ereport(PANIC,
+                (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                 errmsg("[SS][REDO_LOG_TRACE] xlog info %u doesn't belong to segpage.", info)));
+    }
+
+    return isNeedFullSync;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchRepOriginRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchTxnRecord(record, expectedTLIs);
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchCLogRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchTxnRecord(record, expectedTLIs);
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchHashRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    bool isNeedFullSync = false;
+
+    /* indexes do not support MVCC, so we need to sync with the trxn thread when the record is a vacuum */
+    if (IsHashVacuumPages(record) && g_supportHotStandby) {
+        GetSlotIds(record);
+        /* sync with the trxn thread: only the chosen page workers process the
+         * record and then wait for the trxn thread; the trxn thread itself
+         * does not execute it */
+        DispatchToSpecPageWorker(record, expectedTLIs);
+    } else {
+        DispatchRecordWithPages(record, expectedTLIs);
+    }
+
+    return isNeedFullSync;
+}
+
+/* for CFS row compression. */
+static bool DispatchCompresseShrinkRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchTxnRecord(record, expectedTLIs);
+    return true;
+}
+
+static bool DispatchBtreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+    if (info == XLOG_BTREE_REUSE_PAGE) {
+        DispatchTxnRecord(record, expectedTLIs);
+    } else {
+        DispatchRecordWithPages(record, expectedTLIs);
+    }
+
+    return false;
+}
+
+static bool DispatchUBTreeRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+    if (info == XLOG_UBTREE_REUSE_PAGE) {
+        DispatchTxnRecord(record, expectedTLIs);
+    } else {
+        DispatchRecordWithPages(record, expectedTLIs);
+    }
+
+    return false;
+}
+
+static bool DispatchUBTree2Record(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchRecordWithPages(record, expectedTLIs);
+    return false;
+}
+
+/* Run from the dispatcher thread. 
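 * DispatchGinRecord (below) sanity-checks list-page deletion records,
 * whose ndeleted count should match the record's max_block_id, and then
 * routes GIN vacuum records through the trxn-synchronized page-worker
 * path, exactly like the hash-index case above.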
*/
+static bool DispatchGinRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+
+    if (info == XLOG_GIN_DELETE_LISTPAGE) {
+        ginxlogDeleteListPages *data = (ginxlogDeleteListPages *)XLogRecGetData(record);
+        /* output warning */
+        if (data->ndeleted != record->max_block_id) {
+            ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                              errmsg("[REDO_LOG_TRACE]DispatchGinRecord warning info:ndeleted:%d, max_block_id:%d",
+                                     data->ndeleted, record->max_block_id)));
+        }
+    }
+
+    /* indexes do not support MVCC, so we need to sync with the trxn thread when the record is a vacuum */
+    if (IsGinVacuumPages(record) && g_supportHotStandby) {
+        GetSlotIds(record);
+        /* sync with the trxn thread: only the chosen page workers process the
+         * record and then wait for the trxn thread; the trxn thread itself
+         * does not execute it */
+        DispatchToSpecPageWorker(record, expectedTLIs);
+    } else {
+        DispatchRecordWithPages(record, expectedTLIs);
+    }
+
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchGistRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    uint8 info = (XLogRecGetInfo(record) & (~XLR_INFO_MASK));
+
+    if (info == XLOG_GIST_PAGE_SPLIT) {
+        gistxlogPageSplit *xldata = (gistxlogPageSplit *)XLogRecGetData(record);
+        /* output warning */
+        if (xldata->npage != record->max_block_id) {
+            ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                              errmsg("[REDO_LOG_TRACE]DispatchGistRecord warning info:npage:%u, max_block_id:%d",
+                                     xldata->npage, record->max_block_id)));
+        }
+    }
+
+    DispatchRecordWithPages(record, expectedTLIs);
+    return false;
+}
+
+/* Run from the dispatcher thread. */
+static bool DispatchSpgistRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    DispatchRecordWithPages(record, expectedTLIs);
+    return false;
+}
+
+/**
+ * dispatch the record to the chosen page workers only
+ */
+static void DispatchToSpecPageWorker(XLogReaderState *record, List *expectedTLIs)
+{
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+
+    if (g_dispatcher->chosedPLCnt != 1) {
+        ereport(WARNING,
+            (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+             errmsg("[REDO_LOG_TRACE]DispatchToSpecPageWorker maybe some error:rmgrID:%u, info:%u, workerCount:%u",
+                    XLogRecGetRmid(&item->record), XLogRecGetInfo(&item->record), g_dispatcher->chosedPLCnt)));
+    }
+
+    for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) {
+        if (g_dispatcher->chosedPageLineIds[i] > 0) {
+            ReferenceRedoItem(item);
+            AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item);
+        }
+    }
+
+    DereferenceRedoItem(item);
+}
+
+static bool DispatchHeap2VacuumRecord(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    /*
+     * the consistency view is not supported
+     */
+    uint8 info = ((XLogRecGetInfo(record) & (~XLR_INFO_MASK)) & XLOG_HEAP_OPMASK);
+
+    if (info == XLOG_HEAP2_CLEANUP_INFO) {
+        DispatchTxnRecord(record, expectedTLIs);
+    } else {
+        DispatchRecordWithPages(record, expectedTLIs);
+    }
+
+    return false;
+}
+
+/* Run from the dispatcher thread. 
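 * DispatchHeap2Record (below) splits the HEAP2 opcodes three ways:
 * multi-inserts fan out to the lines owning the touched pages, BCM and
 * column-store logical-newpage records are pinned to a single line keyed
 * by relfilenode, and the remaining opcodes fall through to the vacuum
 * path.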
*/
+static bool DispatchHeap2Record(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime)
+{
+    bool isNeedFullSync = false;
+
+    uint8 info = ((XLogRecGetInfo(record) & (~XLR_INFO_MASK)) & XLOG_HEAP_OPMASK);
+
+    if (info == XLOG_HEAP2_MULTI_INSERT) {
+        DispatchRecordWithPages(record, expectedTLIs);
+    } else if (info == XLOG_HEAP2_BCM) {
+        /* we use the relnode as the dispatch key, so the same relation is
+         * dispatched to the same page redo thread even for different fork nums
+         */
+        /* for parallel redo performance */
+        xl_heap_bcm *xlrec = (xl_heap_bcm *)XLogRecGetData(record);
+        RelFileNode tmp_node;
+        RelFileNodeCopy(tmp_node, xlrec->node, XLogRecGetBucketId(record));
+        DispatchToOnePageWorker(record, tmp_node, expectedTLIs);
+
+    } else if (info == XLOG_HEAP2_LOGICAL_NEWPAGE) {
+        if (IS_DN_MULTI_STANDYS_MODE()) {
+            xl_heap_logical_newpage *xlrec = (xl_heap_logical_newpage *)XLogRecGetData(record);
+
+            if (xlrec->type == COLUMN_STORE && xlrec->hasdata) {
+                RelFileNode tmp_node;
+                RelFileNodeCopy(tmp_node, xlrec->node, XLogRecGetBucketId(record));
+                DispatchToOnePageWorker(record, tmp_node, expectedTLIs);
+            } else {
+                RedoItem *item = GetRedoItemPtr(record);
+#ifdef USE_ASSERT_CHECKING
+                ereport(LOG, (errmsg("LOGICAL NEWPAGE %X/%X type:%u, hasdata:%u no need replay",
+                                     (uint32)(record->EndRecPtr >> 32), (uint32)(record->EndRecPtr),
+                                     (uint32)xlrec->type, (uint32)xlrec->hasdata)));
+                for (int i = 0; i <= item->record.max_block_id; ++i) {
+                    if (item->record.blocks[i].in_use) {
+                        item->record.blocks[i].replayed = 1;
+                    }
+                }
+#endif
+                FreeRedoItem(item);
+            }
+        } else {
+            if (!g_instance.attr.attr_storage.enable_mix_replication) {
+                isNeedFullSync = true;
+                DispatchTxnRecord(record, expectedTLIs);
+            } else {
+                RedoItem *item = GetRedoItemPtr(record);
+#ifdef USE_ASSERT_CHECKING
+                ereport(LOG, (errmsg("LOGICAL NEWPAGE %X/%X not multi-standby, no need to replay",
+                                     (uint32)(record->EndRecPtr >> 32), (uint32)(record->EndRecPtr))));
+                for (int i = 0; i <= item->record.max_block_id; ++i) {
+                    if (item->record.blocks[i].in_use) {
+                        item->record.blocks[i].replayed = 1;
+                    }
+                }
+#endif
+                FreeRedoItem(item);
+            }
+        }
+    } else {
+        isNeedFullSync = DispatchHeap2VacuumRecord(record, expectedTLIs, recordXTime);
+    }
+
+    return isNeedFullSync;
+}
+
+/* Run from the dispatcher thread. */
+static void GetSlotIds(XLogReaderState *record)
+{
+    for (int i = 0; i <= record->max_block_id; i++) {
+        DecodedBkpBlock *block = &record->blocks[i];
+
+        if (block->in_use) {
+            uint32 id = GetSlotId(block->rnode, 0, 0, GetBatchCount());
+            AddSlotToPLSet(id);
+        }
+    }
+}
+
+/**
+ * compute the slot id by hashing the relfilenode
+ */
+uint32 GetSlotId(const RelFileNode node, BlockNumber block, ForkNumber forkNum, uint32 workerCount)
+{
+    uint32 undoSpaceWorkersNum = get_recovery_undozidworkers_num();
+    workerCount = workerCount - undoSpaceWorkersNum;
+
+    if (workerCount == 0)
+        return ANY_WORKER;
+
+    return tag_hash((const void*)&node.relNode, sizeof(node.relNode)) % workerCount;
+}
+
+/* Run from the dispatcher thread. 
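 * GetUndoSlotIds (below) locates the XlUndoHeader that sits behind the
 * op-specific fixed-size header in the record's main data, extracts the
 * undo zone id from the undo record pointer, and reserves the undo-space
 * worker that owns that zone.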
*/ +static void GetUndoSlotIds(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 op = info & XLOG_UHEAP_OPMASK; + int size = 0; + + switch (op) { + case XLOG_UHEAP_INSERT: { + size = SizeOfUHeapInsert; + break; + } + case XLOG_UHEAP_DELETE: { + size = SizeOfUHeapDelete; + break; + } + case XLOG_UHEAP_UPDATE: { + size = SizeOfUHeapUpdate; + break; + } + case XLOG_UHEAP_MULTI_INSERT: { + size = 0; + break; + } + case XLOG_UHEAP_FREEZE_TD_SLOT: + case XLOG_UHEAP_INVALID_TD_SLOT: + case XLOG_UHEAP_CLEAN: { + /* No undo actions to redo */ + return; + } + default: + ereport(ERROR, (errmsg("Invalid op in DispatchUHeapRecord"))); + } + + /* Get slot id for undo zone */ + char *xlrec = XLogRecGetData(record); + XlUndoHeader *xlundohdr = (XlUndoHeader *)(xlrec + size); + int zoneid = UNDO_PTR_GET_ZONE_ID(xlundohdr->urecptr); + uint32 undoSlotId = GetUndoSpaceWorkerId(zoneid); + + AddSlotToPLSet(undoSlotId); +} + +static void AddSlotToPLSet(uint32 id) +{ + if (id >= g_dispatcher->pageLineNum) { + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[REDO_LOG_TRACE]AddWorkerToSet:input work id error, id:%u, batch work num %u", id, + g_dispatcher->pageLineNum))); + return; + } + + if (g_dispatcher->chosedPageLineIds[id] == 0) { + g_dispatcher->chosedPLCnt += 1; + } + ++(g_dispatcher->chosedPageLineIds[id]); +} + +/* Run from the dispatcher thread. */ +static bool StandbyWillChangeStandbyState(const XLogReaderState *record) +{ + /* + * If standbyState has reached SNAPSHOT_READY, it will not change + * anymore. Otherwise, it will change if the log record's redo + * function calls ProcArrayApplyRecoveryInfo(). + */ + if ((t_thrd.xlog_cxt.standbyState < STANDBY_SNAPSHOT_READY) && (XLogRecGetRmid(record) == RM_STANDBY_ID) && + ((XLogRecGetInfo(record) & (~XLR_INFO_MASK)) == XLOG_RUNNING_XACTS)) { + /* change standbystate, must be full sync, see UpdateStandbyState */ + return true; + } + + return false; +} + +#ifdef USE_ASSERT_CHECKING +void ItemBlocksOfItemIsReplayed(RedoItem *item) +{ + for (uint32 i = 0; i <= XLR_MAX_BLOCK_ID; ++i) { + if (item->record.blocks[i].in_use) { + if (item->record.blocks[i].forknum == MAIN_FORKNUM) { + Assert((item->record.blocks[i].replayed == 1)); + } + } else { + Assert((item->record.blocks[i].replayed == 0)); + } + } +} + +void GetLsnCheckInfo(uint64 *curPosition, XLogRecPtr *curLsn) +{ + volatile LsnCheckCtl *checkCtl = g_dispatcher->lsnCheckCtl; +#if defined(__x86_64__) || defined(__aarch64__) + uint128_u current = atomic_compare_and_swap_u128((uint128_u *)&checkCtl->curPosition); + Assert(sizeof(checkCtl->curPosition) == sizeof(uint64)); + Assert(sizeof(checkCtl->curLsn) == sizeof(XLogRecPtr)); + + *curPosition = current.u64[0]; + *curLsn = current.u64[1]; +#else + SpinLockAcquire(&checkCtl->ptrLck); + *curPosition = checkCtl->curPosition; + *curLsn = checkCtl->curLsn; + SpinLockRelease(&checkCtl->ptrLck); +#endif +} + +void SetLsnCheckInfo(uint64 curPosition, XLogRecPtr curLsn) +{ + volatile LsnCheckCtl *checkCtl = g_dispatcher->lsnCheckCtl; +#if defined(__x86_64__) || defined(__aarch64__) + uint128_u exchange; + + uint128_u compare = atomic_compare_and_swap_u128((uint128_u *)&checkCtl->curPosition); + Assert(sizeof(checkCtl->curPosition) == sizeof(uint64)); + Assert(sizeof(checkCtl->curLsn) == sizeof(XLogRecPtr)); + + exchange.u64[0] = curPosition; + exchange.u64[1] = curLsn; + + uint128_u current = atomic_compare_and_swap_u128((uint128_u *)&checkCtl->curPosition, compare, exchange); + Assert(compare.u128 == 
current.u128); +#else + SpinLockAcquire(&checkCtl->ptrLck); + checkCtl->curPosition = curPosition; + checkCtl->curLsn = curLsn; + SpinLockRelease(&checkCtl->ptrLck); +#endif /* __x86_64__ */ +} + +bool PushCheckLsn() +{ + uint64 curPosition; + XLogRecPtr curLsn; + GetLsnCheckInfo(&curPosition, &curLsn); + uint32 len = pg_atomic_read_u32(&g_dispatcher->lsnCheckCtl->lsnCheckBuf[curPosition]); + + if (len == 0) { + return false; + } + + // someone else changed it, no need to do it + if (!pg_atomic_compare_exchange_u32(&g_dispatcher->lsnCheckCtl->lsnCheckBuf[curPosition], &len, 0)) { + return false; + } + + SetLsnCheckInfo((curPosition + len) % LSN_CHECK_BUF_SIZE, curLsn + len); + return true; +} + +void ItemLsnCheck(RedoItem *item) +{ + uint64 curPosition; + XLogRecPtr curLsn; + GetLsnCheckInfo(&curPosition, &curLsn); + XLogRecPtr endPtr = item->record.EndRecPtr; + if (endPtr % XLogSegSize == 0) { + XLByteAdvance(endPtr, SizeOfXLogLongPHD); + } else if (endPtr % XLOG_BLCKSZ == 0) { + XLByteAdvance(endPtr, SizeOfXLogShortPHD); + } + uint32 len = (uint32)(endPtr - item->record.ReadRecPtr); + + uint64 nextPosition = (curPosition + (item->record.ReadRecPtr - curLsn)) % LSN_CHECK_BUF_SIZE; + pg_atomic_write_u32(&g_dispatcher->lsnCheckCtl->lsnCheckBuf[nextPosition], len); + + SpinLockAcquire(&g_dispatcher->updateLck); + while (PushCheckLsn()) { + } + SpinLockRelease(&g_dispatcher->updateLck); +} + +void AllItemCheck() +{ + RedoItem *nextItem = g_dispatcher->allocatedRedoItem; + while (nextItem != NULL) { + Assert((nextItem->record.refcount == 0)); + nextItem = nextItem->allocatedNext; + } +} + +#endif + +void ClearRecordInfo(XLogReaderState *xlogState) +{ + xlogState->decoded_record = NULL; + xlogState->main_data = NULL; + xlogState->main_data_len = 0; + + for (int i = 0; i <= xlogState->max_block_id; ++i) { + xlogState->blocks[i].data = NULL; + xlogState->blocks[i].data_len = 0; + xlogState->blocks[i].in_use = false; + xlogState->blocks[i].has_image = false; + xlogState->blocks[i].has_data = false; + xlogState->blocks[i].tdeinfo = NULL; +#ifdef USE_ASSERT_CHECKING + xlogState->blocks[i].replayed = 0; +#endif + } + xlogState->max_block_id = -1; + if (xlogState->readRecordBufSize > BIG_RECORD_LENGTH) { + pfree(xlogState->readRecordBuf); + xlogState->readRecordBuf = NULL; + xlogState->readRecordBufSize = 0; + } + + xlogState->isDecode = false; + xlogState->isFullSync = false; + xlogState->refcount = 0; +} + +/* Run from each page worker thread. 
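 * FreeRedoItem (below) returns an item to the dispatcher's lock-free free
 * list: the record is cleared first, a write barrier publishes the cleared
 * state, and a compare-and-swap loop pushes the item onto freeHead.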
*/ +void FreeRedoItem(RedoItem *item) +{ + if (item->record.isDecode) { +#ifdef USE_ASSERT_CHECKING + ItemBlocksOfItemIsReplayed(item); + ItemLsnCheck(item); +#endif + CountXLogNumbers(&item->record); + } + ClearRecordInfo(&item->record); + pg_write_barrier(); + RedoItem *oldHead = (RedoItem *)pg_atomic_read_uintptr((uintptr_t *)&g_dispatcher->freeHead); + do { + pg_atomic_write_uintptr((uintptr_t *)&item->freeNext, (uintptr_t)oldHead); + } while (!pg_atomic_compare_exchange_uintptr((uintptr_t *)&g_dispatcher->freeHead, (uintptr_t *)&oldHead, + (uintptr_t)item)); +} + +void InitReaderStateByOld(XLogReaderState *newState, XLogReaderState *oldState, bool isNew) +{ + if (isNew) { + newState->read_page = oldState->read_page; + newState->system_identifier = oldState->system_identifier; + newState->private_data = oldState->private_data; + newState->errormsg_buf = oldState->errormsg_buf; + newState->isPRProcess = oldState->isPRProcess; + } + + newState->ReadRecPtr = oldState->ReadRecPtr; + newState->EndRecPtr = oldState->EndRecPtr; + newState->readSegNo = oldState->readSegNo; + newState->readOff = oldState->readOff; + newState->readPageTLI = oldState->readPageTLI; + newState->curReadSegNo = oldState->curReadSegNo; + newState->curReadOff = oldState->curReadOff; + newState->latestPagePtr = oldState->latestPagePtr; + newState->latestPageTLI = oldState->latestPageTLI; + newState->currRecPtr = oldState->currRecPtr; + newState->readBuf = oldState->readBuf; + newState->readLen = oldState->readLen; + newState->preReadStartPtr = oldState->preReadStartPtr; + newState->preReadBuf = oldState->preReadBuf; + + newState->decoded_record = NULL; + newState->main_data = NULL; + newState->main_data_len = 0; + + newState->max_block_id = -1; + newState->readblocks = 0; + /* move block clear to FreeRedoItem because we used MCXT_ALLOC_ZERO to alloc buf, if the variable is not init to 0, + you should put it here. 
*/ + +} + +static XLogReaderState *GetXlogReader(XLogReaderState *readerState) +{ + RedoItem *newItem = NULL; + bool isNew = false; + uint64 count = 0; + do { + if (g_dispatcher->freeStateHead != NULL) { + newItem = g_dispatcher->freeStateHead; + g_dispatcher->freeStateHead = newItem->freeNext; + break; + } else { + RedoItem *head = (RedoItem *)pg_atomic_exchange_uintptr((uintptr_t *)&g_dispatcher->freeHead, + (uintptr_t)NULL); + if (head != NULL) { + pg_read_barrier(); + newItem = head; + g_dispatcher->freeStateHead = newItem->freeNext; + break; + } else if (g_dispatcher->maxItemNum > g_dispatcher->curItemNum) { + newItem = (RedoItem *)palloc_extended(MAXALIGN(sizeof(RedoItem)), MCXT_ALLOC_NO_OOM | MCXT_ALLOC_ZERO); + if (newItem != NULL) { + newItem->allocatedNext = g_dispatcher->allocatedRedoItem; + g_dispatcher->allocatedRedoItem = newItem; + isNew = true; + ++(g_dispatcher->curItemNum); + break; + } + } + + ++count; + if ((count & OUTPUT_WAIT_COUNT) == OUTPUT_WAIT_COUNT) { + ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("GetXlogReader Allocated record buffer failed!, cur item:%u, max item:%u", + g_dispatcher->curItemNum, g_dispatcher->maxItemNum))); + if ((count & PRINT_ALL_WAIT_COUNT) == PRINT_ALL_WAIT_COUNT) { + DumpDispatcher(); + } + } + if (newItem == NULL) { + RedoInterruptCallBack(); + } + } + } while (newItem == NULL); + + InitReaderStateByOld(&newItem->record, readerState, isNew); + newItem->freeNext = NULL; + + return &newItem->record; +} + + +void CopyDataFromOldReader(XLogReaderState *newReaderState, const XLogReaderState *oldReaderState) +{ + errno_t rc = EOK; + if ((newReaderState->readRecordBuf == NULL) || + (oldReaderState->readRecordBufSize > newReaderState->readRecordBufSize)) { + if (!allocate_recordbuf(newReaderState, oldReaderState->readRecordBufSize)) { + ereport(PANIC, + (errmodule(MOD_REDO), + errcode(ERRCODE_LOG), + errmsg("Allocated record buffer failed!, cur item:%u, max item:%u", + g_dispatcher->curItemNum, + g_dispatcher->maxItemNum))); + } + } + + rc = memcpy_s(newReaderState->readRecordBuf, + newReaderState->readRecordBufSize, + oldReaderState->readRecordBuf, + oldReaderState->readRecordBufSize); + securec_check(rc, "\0", "\0"); + newReaderState->decoded_record = (XLogRecord *)newReaderState->readRecordBuf; + newReaderState->max_block_id = oldReaderState->max_block_id; + rc = memcpy_s(newReaderState->blocks, sizeof(DecodedBkpBlock) * (XLR_MAX_BLOCK_ID + 1), oldReaderState->blocks, + sizeof(DecodedBkpBlock) * (XLR_MAX_BLOCK_ID + 1)); + securec_check(rc, "\0", "\0"); + for (int i = 0; i <= oldReaderState->max_block_id; i++) { + if (oldReaderState->blocks[i].has_image) + newReaderState->blocks[i].bkp_image = + (char *)((uintptr_t)newReaderState->decoded_record + + ((uintptr_t)oldReaderState->blocks[i].bkp_image - (uintptr_t)oldReaderState->decoded_record)); + if (oldReaderState->blocks[i].has_data) { + newReaderState->blocks[i].data = (char *)((uintptr_t)newReaderState->decoded_record + + ((uintptr_t)oldReaderState->blocks[i].data - (uintptr_t)oldReaderState->decoded_record)); + newReaderState->blocks[i].data_len = oldReaderState->blocks[i].data_len; + } + } + if (oldReaderState->main_data_len > 0) { + newReaderState->main_data = + (char*)((uintptr_t)newReaderState->decoded_record + + ((uintptr_t)oldReaderState->main_data - (uintptr_t)oldReaderState->decoded_record)); + newReaderState->main_data_len = oldReaderState->main_data_len; + } +} + +XLogReaderState *NewReaderState(XLogReaderState *readerState) +{ + Assert(readerState != 
NULL); + if (!readerState->isPRProcess) + return readerState; + if (DispatchPtrIsNull()) + ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("NewReaderState Dispatch is null"))); + + XLogReaderState *retReaderState = GetXlogReader(readerState); + return retReaderState; +} + +void FreeAllocatedRedoItem() +{ + while ((g_dispatcher != NULL) && (g_dispatcher->allocatedRedoItem != NULL)) { + RedoItem *pItem = g_dispatcher->allocatedRedoItem; + g_dispatcher->allocatedRedoItem = pItem->allocatedNext; + XLogReaderState *tmpRec = &(pItem->record); + + if (tmpRec->readRecordBuf) { + pfree(tmpRec->readRecordBuf); + tmpRec->readRecordBuf = NULL; + } + + pfree(pItem); + } +} + +/* Run from the dispatcher thread. */ +void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code) +{ + ereport( + LOG, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[REDO_LOG_TRACE]SendRecoveryEndMarkToWorkersAndWaitForFinish, ready to stop redo workers, code: %d", + code))); + if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readPageThd); + PageRedoPipeline *pl = g_dispatcher->pageLines; + /* send end mark */ + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + SendPageRedoEndMark(pl[i].batchThd); + } + SendPageRedoEndMark(g_dispatcher->trxnLine.managerThd); + + /* wait */ + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + WaitPageRedoWorkerReachLastMark(pl[i].batchThd); + } + pg_atomic_write_u32(&(g_dispatcher->rtoXlogBufState.xlogReadManagerState), READ_MANAGER_STOP); + + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.managerThd); + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readThd); + WaitPageRedoWorkerReachLastMark(g_dispatcher->trxnLine.managerThd); + LsnUpdate(); +#ifdef USE_ASSERT_CHECKING + AllItemCheck(); +#endif + (void)RegisterRedoInterruptCallBack(g_dispatcher->oldStartupIntrruptFunc); + } +} + +void SendRecoveryEndMarkToWorkersAndWaitForReach(int code) +{ + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[SS][REDO_LOG_TRACE] On-demand recovery dispatch finish, send RecoveryEndMark to workers, code: %d", + code))); + if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readPageThd); + PageRedoPipeline *pl = g_dispatcher->pageLines; + + /* Read finish, need to check if can go to Phase two */ + XLogRecPtr lastReadEndPtr = g_dispatcher->readLine.readPageThd->lastReplayedEndRecPtr; + + /* Wait for trxn finished replay and redo hash table complete */ + while (true) { + XLogRecPtr trxnCompletePtr = GetCompletedRecPtr(g_dispatcher->trxnLine.redoThd); + XLogRecPtr pageMngrCompletePtr = InvalidXLogRecPtr; + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) { + if (g_dispatcher->allWorkers[i]->role == REDO_PAGE_MNG) { + XLogRecPtr tmpStart = MAX_XLOG_REC_PTR; + XLogRecPtr tmpEnd = MAX_XLOG_REC_PTR; + GetCompletedReadEndPtr(g_dispatcher->allWorkers[i], &tmpStart, &tmpEnd); + if (XLByteLT(tmpEnd, pageMngrCompletePtr) || pageMngrCompletePtr == InvalidXLogRecPtr) { + pageMngrCompletePtr = tmpEnd; + } + } + } + ereport(LOG, (errmsg("[SS][REDO_LOG_TRACE] lastReadXact: %lu, trxnComplete: %lu, pageMgrComplele: %lu", + lastReadEndPtr, trxnCompletePtr, pageMngrCompletePtr))); + if (XLByteEQ(trxnCompletePtr, lastReadEndPtr) && XLByteEQ(pageMngrCompletePtr, lastReadEndPtr)) { + break; + } + + long sleeptime = 5 * 1000; + pg_usleep(sleeptime); + } + /* we only send end mark but don't wait */ + for 
(uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + SendPageRedoEndMark(pl[i].batchThd); + } + SendPageRedoEndMark(g_dispatcher->trxnLine.managerThd); + + /* Stop Read Thrd only */ + pg_atomic_write_u32(&(g_dispatcher->rtoXlogBufState.xlogReadManagerState), READ_MANAGER_STOP); + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.managerThd); + WaitPageRedoWorkerReachLastMark(g_dispatcher->readLine.readThd); + LsnUpdate(); + XLogRecPtr lastReplayed = GetXLogReplayRecPtr(NULL); + ereport(LOG, (errmsg("[SS][REDO_LOG_TRACE] Current LastReplayed: %lu", lastReplayed))); + (void)RegisterRedoInterruptCallBack(g_dispatcher->oldStartupIntrruptFunc); + } +} + +void WaitRedoFinish() +{ + /* make pmstate as run so db can accept service from now */ + g_instance.fatal_error = false; + g_instance.demotion = NoDemote; + pmState = PM_RUN; + write_stderr_with_prefix("[On-demand] LOG: database system is ready to accept connections"); + + SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = true; + SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); + + /* for other nodes in cluster */ + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_RECOVERY; + SSSaveReformerCtrl(); + +#ifdef USE_ASSERT_CHECKING + XLogRecPtr minStart = MAX_XLOG_REC_PTR; + XLogRecPtr minEnd = MAX_XLOG_REC_PTR; + GetReplayedRecPtr(&minStart, &minEnd); + ereport(LOG, (errmsg("[SS][REDO_LOG_TRACE] Current LastReplayed: %lu", minEnd))); +#endif + + PageRedoPipeline *pl = g_dispatcher->pageLines; + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + WaitPageRedoWorkerReachLastMark(pl[i].batchThd); + } + WaitPageRedoWorkerReachLastMark(g_dispatcher->trxnLine.managerThd); + XLogParseBufferDestoryFunc(&(g_dispatcher->parseManager)); + LsnUpdate(); +#ifdef USE_ASSERT_CHECKING + AllItemCheck(); +#endif + SpinLockAcquire(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone = true; + SpinLockRelease(&t_thrd.shemem_ptr_cxt.XLogCtl->info_lck); +} + +/* Run from each page worker and the txn worker thread. */ +int GetDispatcherExitCode() +{ + return (int)pg_atomic_read_u32((uint32 *)&g_dispatcher->exitCode); +} + +/* Run from the dispatcher thread. */ +uint32 GetAllWorkerCount() +{ + return g_dispatcher == NULL ? 0 : g_dispatcher->allWorkersCnt; +} + +/* Run from the dispatcher thread. */ +uint32 GetBatchCount() +{ + return g_dispatcher == NULL ? 0 : g_dispatcher->pageLineNum; +} + +bool DispatchPtrIsNull() +{ + return (g_dispatcher == NULL); +} + +/* Run from each page worker thread. */ +PGPROC *StartupPidGetProc(ThreadId pid) +{ + if (pid == g_instance.proc_base->startupProcPid) + return g_instance.proc_base->startupProc; + if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + PGPROC *proc = GetPageRedoWorkerProc(g_dispatcher->allWorkers[i]); + if (pid == proc->pid) + return proc; + } + } + return NULL; +} + +/* Run from the dispatcher and txn worker thread. 
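 * UpdateStandbyState (below) pushes the new hot-standby state to the
 * batch, manager, and redo workers of every page line, then to the trxn
 * and read lines, and only afterwards publishes it atomically in the
 * dispatcher itself.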
*/ +void UpdateStandbyState(HotStandbyState newState) +{ + PageRedoPipeline *pl = NULL; + if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + pl = &(g_dispatcher->pageLines[i]); + UpdatePageRedoWorkerStandbyState(pl->batchThd, newState); + UpdatePageRedoWorkerStandbyState(pl->managerThd, newState); + for (uint32 j = 0; j < pl->redoThdNum; j++) { + UpdatePageRedoWorkerStandbyState(pl->redoThd[j], newState); + } + } + UpdatePageRedoWorkerStandbyState(g_dispatcher->trxnLine.managerThd, newState); + UpdatePageRedoWorkerStandbyState(g_dispatcher->trxnLine.redoThd, newState); + UpdatePageRedoWorkerStandbyState(g_dispatcher->readLine.managerThd, newState); + UpdatePageRedoWorkerStandbyState(g_dispatcher->readLine.readPageThd, newState); + UpdatePageRedoWorkerStandbyState(g_dispatcher->readLine.readThd, newState); + pg_atomic_write_u32(&(g_dispatcher->standbyState), newState); + } +} + +void UpdateMinRecoveryForTrxnRedoThd(XLogRecPtr newMinRecoveryPoint) +{ + if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) { + g_dispatcher->trxnLine.redoThd->minRecoveryPoint = newMinRecoveryPoint; + } +} + +/* Run from the dispatcher thread. */ +void **GetXLogInvalidPagesFromWorkers() +{ + return CollectStatesFromWorkers(GetXLogInvalidPages); +} + +/* Run from the dispatcher thread. */ +static void **CollectStatesFromWorkers(GetStateFunc getStateFunc) +{ + if (g_dispatcher->allWorkersCnt > 0) { + void **stateArray = (void **)palloc(sizeof(void *) * g_dispatcher->allWorkersCnt); + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) + stateArray[i] = getStateFunc(g_dispatcher->allWorkers[i]); + return stateArray; + } else + return NULL; +} + +XLogRecPtr GetSafeMinCheckPoint() +{ + XLogRecPtr minSafeCheckPoint = MAX_XLOG_REC_PTR; + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) { + if (g_dispatcher->allWorkers[i]->role == REDO_PAGE_WORKER) { + if (XLByteLT(g_dispatcher->allWorkers[i]->lastCheckedRestartPoint, minSafeCheckPoint)) { + minSafeCheckPoint = g_dispatcher->allWorkers[i]->lastCheckedRestartPoint; + } + } + } + + return minSafeCheckPoint; +} + +void GetReplayedRecPtr(XLogRecPtr *startPtr, XLogRecPtr *endPtr) +{ + XLogRecPtr minStart = MAX_XLOG_REC_PTR; + XLogRecPtr minEnd = MAX_XLOG_REC_PTR; + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) { + if ((g_dispatcher->allWorkers[i]->role == REDO_PAGE_WORKER) || + (g_dispatcher->allWorkers[i]->role == REDO_TRXN_WORKER)) { + XLogRecPtr tmpStart = MAX_XLOG_REC_PTR; + XLogRecPtr tmpEnd = MAX_XLOG_REC_PTR; + GetCompletedReadEndPtr(g_dispatcher->allWorkers[i], &tmpStart, &tmpEnd); + if (XLByteLT(tmpEnd, minEnd)) { + minStart = tmpStart; + minEnd = tmpEnd; + } + } + } + *startPtr = minStart; + *endPtr = minEnd; +} + +RedoWaitInfo redo_get_io_event(int32 event_id) +{ + WaitStatisticsInfo *tmpStatics = NULL; + RedoWaitInfo resultInfo; + resultInfo.counter = 0; + resultInfo.total_duration = 0; + PgBackendStatus *beentry = NULL; + int index = MAX_BACKEND_SLOT + StartupProcess; + + if (IS_PGSTATE_TRACK_UNDEFINE || t_thrd.shemem_ptr_cxt.BackendStatusArray == NULL) { + return resultInfo; + } + + beentry = t_thrd.shemem_ptr_cxt.BackendStatusArray + index; + tmpStatics = &(beentry->waitInfo.event_info.io_info[event_id - WAIT_EVENT_BUFFILE_READ]); + resultInfo.total_duration = tmpStatics->total_duration; + resultInfo.counter = tmpStatics->counter; + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + if (g_dispatcher == NULL || 
g_dispatcher->allWorkers == NULL || + g_instance.comm_cxt.predo_cxt.state != REDO_IN_PROGRESS) { + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + return resultInfo; + } + + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + if (g_dispatcher->allWorkers[i] == NULL) { + break; + } + index = g_dispatcher->allWorkers[i]->index; + beentry = t_thrd.shemem_ptr_cxt.BackendStatusArray + index; + tmpStatics = &(beentry->waitInfo.event_info.io_info[event_id - WAIT_EVENT_BUFFILE_READ]); + resultInfo.total_duration += tmpStatics->total_duration; + resultInfo.counter += tmpStatics->counter; + } + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + + return resultInfo; +} + +void redo_get_worker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) +{ + PageRedoWorker *redoWorker = NULL; + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + if (g_dispatcher == NULL) { + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + *realNum = 0; + return; + } + *realNum = g_dispatcher->pageLineNum; + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + redoWorker = (g_dispatcher->pageLines[i].batchThd); + worker[i].id = redoWorker->id; + worker[i].queue_usage = SPSCGetQueueCount(redoWorker->queue); + worker[i].queue_max_usage = (uint32)(pg_atomic_read_u32(&((redoWorker->queue)->maxUsage))); + worker[i].redo_rec_count = (uint32)(pg_atomic_read_u64(&((redoWorker->queue)->totalCnt))); + } + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); +} + +void make_worker_static_info(RedoWorkerTimeCountsInfo *workerCountInfo, PageRedoWorker *redoWorker, + int piplineid, int id) +{ + const uint32 pipelineNumSize = 2; + const uint32 redoWorkerNumSize = 2; + const char *role_name = RedoWokerRole2Str(redoWorker->role); + uint32 allocSize = strlen(role_name) + pipelineNumSize + 1 + redoWorkerNumSize + 1; + workerCountInfo->worker_name = (char*)palloc0(allocSize); + if (id != invalid_worker_id) { + errno_t rc = sprintf_s(workerCountInfo->worker_name, allocSize, "%s%02d%02d", role_name, piplineid, id); + securec_check_ss(rc, "\0", "\0"); + } else { + errno_t rc = sprintf_s(workerCountInfo->worker_name, allocSize, "%s%02d", role_name, piplineid); + securec_check_ss(rc, "\0", "\0"); + } + workerCountInfo->time_cost = redoWorker->timeCostList; +} + +void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) +{ + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock)); + knl_parallel_redo_state state = g_instance.comm_cxt.predo_cxt.state; + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock)); + + if (state != REDO_IN_PROGRESS) { + *realNum = 0; + return; + } + + PageRedoWorker *redoWorker = NULL; + + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + if (g_dispatcher == NULL) { + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); + *realNum = 0; + return; + } + *realNum = g_dispatcher->allWorkersCnt + 1; + RedoWorkerTimeCountsInfo *workerList = + (RedoWorkerTimeCountsInfo *)palloc0((*realNum) * sizeof(RedoWorkerTimeCountsInfo)); + errno_t rc; + uint32 cur_pos = 0; + uint32 allocSize; + for (int i = 0; i < (int)g_dispatcher->pageLineNum; ++i) { + redoWorker = (g_dispatcher->pageLines[i].batchThd); + make_worker_static_info(&workerList[cur_pos++], redoWorker, i, invalid_worker_id); + + redoWorker = (g_dispatcher->pageLines[i].managerThd); + make_worker_static_info(&workerList[cur_pos++], redoWorker, i, invalid_worker_id); + + for (int j = 0; j < 
(int)g_dispatcher->pageLines[i].redoThdNum; ++j) {
+            redoWorker = (g_dispatcher->pageLines[i].redoThd[j]);
+            make_worker_static_info(&workerList[cur_pos++], redoWorker, i, j);
+        }
+    }
+
+    make_worker_static_info(&workerList[cur_pos++], g_dispatcher->trxnLine.managerThd, 0, invalid_worker_id);
+    make_worker_static_info(&workerList[cur_pos++], g_dispatcher->trxnLine.redoThd, 0, invalid_worker_id);
+    make_worker_static_info(&workerList[cur_pos++], g_dispatcher->readLine.readPageThd, 0, invalid_worker_id);
+    make_worker_static_info(&workerList[cur_pos++], g_dispatcher->readLine.readThd, 0, invalid_worker_id);
+    make_worker_static_info(&workerList[cur_pos++], g_dispatcher->readLine.managerThd, 0, invalid_worker_id);
+
+    const char *startupName = "startup";
+    allocSize = strlen(startupName) + 1;
+    workerList[cur_pos].worker_name = (char*)palloc0(allocSize);
+    rc = sprintf_s(workerList[cur_pos].worker_name, allocSize, "%s", startupName);
+    securec_check_ss(rc, "\0", "\0");
+    workerList[cur_pos++].time_cost = g_dispatcher->startupTimeCost;
+    SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.destroy_lock));
+    *workerCountInfoList = workerList;
+    Assert(cur_pos == *realNum);
+}
+
+void CheckCommittingCsnList()
+{
+#ifndef ENABLE_MULTIPLE_NODES
+    for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) {
+        CleanUpMakeCommitAbort(reinterpret_cast<List *>(g_dispatcher->allWorkers[i]->committingCsnList));
+        g_dispatcher->allWorkers[i]->committingCsnList = NULL;
+    }
+#else
+    TransactionId clean_xid = InvalidTransactionId;
+    if (!IS_PGXC_COORDINATOR && t_thrd.proc->workingVersionNum >= DISASTER_READ_VERSION_NUM) {
+        if (log_min_messages <= DEBUG4) {
+            ereport(LOG, (errmsg("CheckCommittingCsnList: insert clean xlog")));
+        }
+        XLogBeginInsert();
+        XLogRegisterData((char*)(&clean_xid), sizeof(TransactionId));
+        XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_CSN_ABORTED);
+    }
+#endif
+}
+
+/* uheap dispatch functions */
+static bool DispatchUHeapRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime)
+{
+    GetSlotIds(record);
+    GetUndoSlotIds(record);
+
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+    for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) {
+        if (g_dispatcher->chosedPageLineIds[i] > 0) {
+            ReferenceRedoItem(item);
+            AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item);
+            elog(DEBUG1, "Dispatch page worker %d", i);
+        }
+    }
+    DereferenceRedoItem(item);
+
+    return false;
+}
+
+static bool DispatchUHeap2Record(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime)
+{
+    GetSlotIds(record);
+
+    RedoItem *item = GetRedoItemPtr(record);
+    ReferenceRedoItem(item);
+    for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) {
+        if (g_dispatcher->chosedPageLineIds[i] > 0) {
+            ReferenceRedoItem(item);
+            AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item);
+            elog(DEBUG1, "Dispatch page worker %d", i);
+        }
+    }
+    DereferenceRedoItem(item);
+
+    return false;
+}
+
+static bool DispatchUHeapUndoRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime)
+{
+    uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+    uint8 op = info & XLOG_UHEAP_OPMASK;
+    char *opName = NULL;
+    int zoneId = 0;
+
+    switch (op) {
+        case XLOG_UNDO_DISCARD: {
+            undo::XlogUndoDiscard *xlrec = (undo::XlogUndoDiscard *)XLogRecGetData(record);
+            zoneId = UNDO_PTR_GET_ZONE_ID(xlrec->startSlot);
+            opName = "UNDO_DISCARD";
+            break;
+        }
+        case XLOG_UNDO_UNLINK:
+        case XLOG_SLOT_UNLINK: {
+            undo::XlogUndoUnlink *xlrec = (undo::XlogUndoUnlink *)XLogRecGetData(record);
+            zoneId = 
UNDO_PTR_GET_ZONE_ID(xlrec->head); + opName = "UNDO_UNLINK"; + break; + } + case XLOG_UNDO_EXTEND: + case XLOG_SLOT_EXTEND: { + undo::XlogUndoExtend *xlrec = (undo::XlogUndoExtend *) XLogRecGetData(record); + zoneId = UNDO_PTR_GET_ZONE_ID(xlrec->tail); + opName = "UNDO_ALLOCATE"; + break; + } + default: { + elog(ERROR, "Invalid op in DispatchUHeapUndoRecord: %u", (uint8) op); + } + } + + uint32 undoWorkerId = GetUndoSpaceWorkerId(zoneId); + AddSlotToPLSet(undoWorkerId); + elog(DEBUG1, "Dispatch %s xid(%lu) lsn(%016lx) undo worker zid %d, undoWorkerId %d", + opName, XLogRecGetXid(record), record->EndRecPtr, zoneId, undoWorkerId); + + RedoItem *item = GetRedoItemPtr(record); + ReferenceRedoItem(item); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + if (g_dispatcher->chosedPageLineIds[i] > 0) { + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + elog(DEBUG1, "Dispatch page worker %d", i); + } + } + DereferenceRedoItem(item); + + return false; +} + +static bool DispatchUndoActionRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 op = info & XLOG_UHEAP_OPMASK; + + switch (op) { + case XLOG_UHEAPUNDO_RESET_SLOT: { + elog(DEBUG1, "Dispatch UHEAPUNDO_RESET_SLOT xid(%lu) lsn(%016lx)", + XLogRecGetXid(record), record->EndRecPtr); + break; + } + case XLOG_UHEAPUNDO_PAGE: { + elog(DEBUG1, "Dispatch XLOG_UHEAPUNDO_PAGE xid(%lu) lsn(%016lx)", + XLogRecGetXid(record), record->EndRecPtr); + break; + } + case XLOG_UHEAPUNDO_ABORT_SPECINSERT: { + elog(DEBUG1, "Dispatch XLOG_UHEAPUNDO_ABORT_SPECINSERT xid(%lu) lsn(%016lx)", + XLogRecGetXid(record), record->EndRecPtr); + break; + } + default: { + elog(ERROR, "Invalid op in DispatchUndoActionRecord: %u", (uint8) op); + } + } + + GetSlotIds(record); + + RedoItem *item = GetRedoItemPtr(record); + ReferenceRedoItem(item); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + if (g_dispatcher->chosedPageLineIds[i] > 0) { + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + elog(DEBUG1, "Dispatch page worker %d", i); + } + } + DereferenceRedoItem(item); + + return false; +} + +static bool DispatchRollbackFinishRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + uint8 op = info & XLOG_UHEAP_OPMASK; + + switch (op) { + case XLOG_ROLLBACK_FINISH: { + undo::XlogRollbackFinish *xlrec = (undo::XlogRollbackFinish *)XLogRecGetData(record); + uint32 undoWorkerId = 0; + + undoWorkerId = GetUndoSpaceWorkerId((int)UNDO_PTR_GET_ZONE_ID(xlrec->slotPtr)); + AddSlotToPLSet(undoWorkerId); + elog(DEBUG1, "Dispatch ROLLBACK_FINISH xid(%lu) lsn(%016lx) undo worker zid %d, undoWorkerId %d", + XLogRecGetXid(record), record->EndRecPtr, (int)UNDO_PTR_GET_ZONE_ID(xlrec->slotPtr), undoWorkerId); + + RedoItem *item = GetRedoItemPtr(record); + ReferenceRedoItem(item); + for (uint32 i = 0; i < g_dispatcher->pageLineNum; i++) { + if (g_dispatcher->chosedPageLineIds[i] > 0) { + ReferenceRedoItem(item); + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + elog(DEBUG1, "Dispatch page worker %d", i); + } + } + DereferenceRedoItem(item); + break; + } + default: { + elog(ERROR, "Invalid op in DispatchRollbackFinishRecord: %u", (uint8) op); + } + } + + return false; +} + +static inline uint32 GetUndoSpaceWorkerId(int zid) +{ + uint32 workerCount = GetBatchCount(); + uint32 undoSpaceWorkersNum = 
get_recovery_undozidworkers_num();
+    int firstUndoLogWorker = (workerCount - undoSpaceWorkersNum);
+
+    if (workerCount == 0)
+        return ANY_WORKER;
+
+    Assert(undoSpaceWorkersNum != 0);
+
+    return (tag_hash(&zid, sizeof(zid)) % undoSpaceWorkersNum + firstUndoLogWorker);
+}
+
+
+} // namespace ondemand_extreme_rto
diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ec868350f37e198eb89549109743d2663c41c99
--- /dev/null
+++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp
@@ -0,0 +1,2934 @@
+/*
+ * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ *          http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * page_redo.cpp
+ *      PageRedoWorker is a thread of execution that replays data page logs.
+ *      It provides a synchronization mechanism for replaying logs touching
+ *      multiple pages.
+ *
+ *      In the current implementation, logs modifying the same page must
+ *      always be replayed by the same worker. There is no mechanism for
+ *      an idle worker to "steal" work from a busy worker.
+ *
+ * IDENTIFICATION
+ *      src/gausskernel/storage/access/transam/ondemand_extreme_rto/page_redo.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include
+#include
+
+#include "postgres.h"
+#include "knl/knl_variable.h"
+#include "gs_thread.h"
+#include "miscadmin.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xlog_internal.h"
+#include "access/xlogutils.h"
+#include "access/xlogproc.h"
+#include "access/nbtree.h"
+#include "catalog/storage_xlog.h"
+#include "ddes/dms/ss_dms_recovery.h"
+#include "gssignal/gs_signal.h"
+#include "libpq/pqsignal.h"
+#include "postmaster/postmaster.h"
+#include "storage/ipc.h"
+#include "storage/freespace.h"
+#include "storage/smgr/smgr.h"
+#include "storage/smgr/relfilenode_hash.h"
+#include "storage/standby.h"
+#include "storage/pmsignal.h"
+#include "utils/guc.h"
+#include "utils/palloc.h"
+#include "portability/instr_time.h"
+#include "postmaster/startup.h"
+#include "postmaster/pagerepair.h"
+#include "catalog/storage.h"
+#include
+#include
+#include "commands/dbcommands.h"
+#include "commands/tablespace.h"
+#include "access/ondemand_extreme_rto/page_redo.h"
+#include "access/ondemand_extreme_rto/dispatcher.h"
+#include "access/ondemand_extreme_rto/txn_redo.h"
+#include "access/ondemand_extreme_rto/xlog_read.h"
+#include "pgstat.h"
+#include "access/ondemand_extreme_rto/batch_redo.h"
+#include "access/multi_redo_api.h"
+#include "replication/walreceiver.h"
+#include "replication/datareceiver.h"
+#include "pgxc/barrier.h"
+#include "storage/file/fio_device.h"
+#ifdef ENABLE_MOT
+#include "storage/mot/mot_fdw.h"
+#endif
+
+#ifdef EXTREME_RTO_DEBUG
+#include
+#include
+
+#include
+
+#include
+
+#endif
+
+#ifdef ENABLE_UT
+#include "utils/utesteventutil.h"
+#define STATIC
+#else
+#define STATIC static
+#endif
+
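+/*
+ * Page affinity is what makes single-worker replay safe: the routing rule
+ * (see GetSlotId in dispatcher.cpp) is, in essence,
+ *
+ *     line = tag_hash(&node.relNode, sizeof(node.relNode))
+ *            % (pageLineNum - undoSpaceWorkersNum);
+ *
+ * so all records touching a relation land on the same page line and no
+ * cross-worker locking is needed while replaying a page.
+ */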
+namespace ondemand_extreme_rto { +static const int MAX_PARSE_BUFF_NUM = PAGE_WORK_QUEUE_SIZE * 10 * 3; +static const int MAX_LOCAL_BUFF_NUM = PAGE_WORK_QUEUE_SIZE * 10 * 3; + +static const char *const PROCESS_TYPE_CMD_ARG = "--forkpageredo"; +static char g_AUXILIARY_TYPE_CMD_ARG[16] = {0}; + +THR_LOCAL PageRedoWorker *g_redoWorker = NULL; +THR_LOCAL RecordBufferState *g_recordbuffer = NULL; +RedoItem g_redoEndMark = { false, false, NULL, 0, NULL, 0 }; +RedoItem g_terminateMark = { false, false, NULL, 0, NULL, 0 }; +RedoItem g_GlobalLsnForwarder; +RedoItem g_cleanupMark; +RedoItem g_closefdMark; +RedoItem g_cleanInvalidPageMark; + +static const int PAGE_REDO_WORKER_ARG = 3; +static const int REDO_SLEEP_50US = 50; +static const int REDO_SLEEP_100US = 100; + +static void ApplySinglePageRecord(RedoItem *); +static void InitGlobals(); +static void LastMarkReached(); +static void SetupSignalHandlers(); +static void SigHupHandler(SIGNAL_ARGS); +static ThreadId StartWorkerThread(PageRedoWorker *); + +void RedoThrdWaitForExit(const PageRedoWorker *wk); +void AddRefRecord(void *rec); +void SubRefRecord(void *rec); +void GlobalLsnUpdate(); +static void TrxnMangerQueueCallBack(); +#ifdef USE_ASSERT_CHECKING +void RecordBlockCheck(void *rec, XLogRecPtr curPageLsn, uint32 blockId, bool replayed); +#endif +void AddRecordReadBlocks(void *rec, uint32 readblocks); + +RefOperate recordRefOperate = { + AddRefRecord, + SubRefRecord, +#ifdef USE_ASSERT_CHECKING + RecordBlockCheck, +#endif + AddRecordReadBlocks, +}; + +void UpdateRecordGlobals(RedoItem *item, HotStandbyState standbyState) +{ + t_thrd.xlog_cxt.ReadRecPtr = item->record.ReadRecPtr; + t_thrd.xlog_cxt.EndRecPtr = item->record.EndRecPtr; + t_thrd.xlog_cxt.expectedTLIs = item->expectedTLIs; + /* apply recoveryinfo will change standbystate see UpdateRecordGlobals */ + t_thrd.xlog_cxt.standbyState = standbyState; + t_thrd.xlog_cxt.XLogReceiptTime = item->syncXLogReceiptTime; + t_thrd.xlog_cxt.XLogReceiptSource = item->syncXLogReceiptSource; + u_sess->utils_cxt.RecentXmin = item->RecentXmin; + t_thrd.xlog_cxt.server_mode = item->syncServerMode; +} + +/* Run from the dispatcher thread. */ +PageRedoWorker *StartPageRedoWorker(PageRedoWorker *worker) +{ + Assert(worker); + uint32 id = worker->id; + ThreadId threadId = StartWorkerThread(worker); + if (threadId == 0) { + ereport(WARNING, (errmsg("Cannot create page-redo-worker thread: %u, %m.", id))); + DestroyPageRedoWorker(worker); + return NULL; + } else { + ereport(LOG, (errmsg("StartPageRedoWorker successfully create page-redo-worker id: %u, threadId:%lu.", id, + worker->tid.thid))); + } + g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[id].threadId = threadId; + SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock)); + uint32 state = pg_atomic_read_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[id].threadState)); + if (state != PAGE_REDO_WORKER_READY) { + g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[id].threadState = PAGE_REDO_WORKER_START; + } + SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock)); + return worker; +} + +void RedoWorkerQueueCallBack() +{ + RedoInterruptCallBack(); +} + +bool RedoWorkerIsUndoSpaceWorker() +{ + return g_redoWorker->isUndoSpaceWorker; +} + +/* Run from the dispatcher thread. 
*/ +PageRedoWorker *CreateWorker(uint32 id) +{ + PageRedoWorker *tmp = (PageRedoWorker *)palloc0(sizeof(PageRedoWorker) + ONDEMAND_EXTREME_RTO_ALIGN_LEN); + PageRedoWorker *worker; + worker = (PageRedoWorker *)TYPEALIGN(ONDEMAND_EXTREME_RTO_ALIGN_LEN, tmp); + worker->selfOrinAddr = tmp; + worker->id = id; + worker->index = 0; + worker->tid.thid = InvalidTid; + worker->proc = NULL; + worker->initialServerMode = (ServerMode)t_thrd.xlog_cxt.server_mode; + worker->initialTimeLineID = t_thrd.xlog_cxt.ThisTimeLineID; + worker->expectedTLIs = t_thrd.xlog_cxt.expectedTLIs; + worker->recoveryTargetTLI = t_thrd.xlog_cxt.recoveryTargetTLI; + worker->recoveryRestoreCommand = t_thrd.xlog_cxt.recoveryRestoreCommand; + worker->ArchiveRecoveryRequested = t_thrd.xlog_cxt.ArchiveRecoveryRequested; + worker->StandbyModeRequested = t_thrd.xlog_cxt.StandbyModeRequested; + worker->InArchiveRecovery = t_thrd.xlog_cxt.InArchiveRecovery; + worker->InRecovery = t_thrd.xlog_cxt.InRecovery; + worker->ArchiveRestoreRequested = t_thrd.xlog_cxt.ArchiveRestoreRequested; + worker->minRecoveryPoint = t_thrd.xlog_cxt.minRecoveryPoint; + + worker->pendingHead = NULL; + worker->pendingTail = NULL; + worker->queue = SPSCBlockingQueueCreate(PAGE_WORK_QUEUE_SIZE, RedoWorkerQueueCallBack); + worker->lastCheckedRestartPoint = InvalidXLogRecPtr; + worker->lastReplayedEndRecPtr = InvalidXLogRecPtr; + worker->standbyState = (HotStandbyState)t_thrd.xlog_cxt.standbyState; + worker->StandbyMode = t_thrd.xlog_cxt.StandbyMode; + worker->latestObservedXid = t_thrd.storage_cxt.latestObservedXid; + worker->DataDir = t_thrd.proc_cxt.DataDir; + worker->RecentXmin = u_sess->utils_cxt.RecentXmin; + worker->xlogInvalidPages = NULL; + PosixSemaphoreInit(&worker->phaseMarker, 0); + worker->oldCtx = NULL; + worker->fullSyncFlag = 0; +#if (!defined __x86_64__) && (!defined __aarch64__) + SpinLockInit(&worker->ptrLck); +#endif + worker->parseManager.memctl.isInit = false; + worker->parseManager.parsebuffers = NULL; + return worker; +} + +/* Run from the dispatcher thread. */ +static ThreadId StartWorkerThread(PageRedoWorker *worker) +{ + worker->tid.thid = initialize_util_thread(PAGEREDO, worker); + return worker->tid.thid; +} + +/* Run from the dispatcher thread. 
*/
+void DestroyPageRedoWorker(PageRedoWorker *worker)
+{
+    PosixSemaphoreDestroy(&worker->phaseMarker);
+    SPSCBlockingQueueDestroy(worker->queue);
+    XLogRedoBufferDestoryFunc(&(worker->bufferManager));
+    XLogParseBufferDestoryFunc(&(worker->parseManager));
+    pfree(worker->selfOrinAddr);
+}
+
+/* atomic write of lastReplayedReadRecPtr and lastReplayedEndRecPtr */
+void SetCompletedReadEndPtr(PageRedoWorker *worker, XLogRecPtr readPtr, XLogRecPtr endPtr)
+{
+    volatile PageRedoWorker *tmpWk = worker;
+#if defined(__x86_64__) || defined(__aarch64__)
+    uint128_u exchange;
+    uint128_u current;
+    uint128_u compare = atomic_compare_and_swap_u128((uint128_u *)&tmpWk->lastReplayedReadRecPtr);
+
+    Assert(sizeof(tmpWk->lastReplayedReadRecPtr) == 8);
+    Assert(sizeof(tmpWk->lastReplayedEndRecPtr) == 8);
+
+    exchange.u64[0] = (uint64)readPtr;
+    exchange.u64[1] = (uint64)endPtr;
+
+loop:
+    current = atomic_compare_and_swap_u128((uint128_u *)&tmpWk->lastReplayedReadRecPtr, compare, exchange);
+    if (!UINT128_IS_EQUAL(compare, current)) {
+        UINT128_COPY(compare, current);
+        goto loop;
+    }
+#else
+    SpinLockAcquire(&tmpWk->ptrLck);
+    tmpWk->lastReplayedReadRecPtr = readPtr;
+    tmpWk->lastReplayedEndRecPtr = endPtr;
+    SpinLockRelease(&tmpWk->ptrLck);
+#endif /* __x86_64__ || __aarch64__ */
+}
+
+/* atomic read of lastReplayedReadRecPtr and lastReplayedEndRecPtr */
+void GetCompletedReadEndPtr(PageRedoWorker *worker, XLogRecPtr *readPtr, XLogRecPtr *endPtr)
+{
+    volatile PageRedoWorker *tmpWk = worker;
+#if defined(__x86_64__) || defined(__aarch64__)
+    uint128_u compare = atomic_compare_and_swap_u128((uint128_u *)&tmpWk->lastReplayedReadRecPtr);
+    Assert(sizeof(tmpWk->lastReplayedReadRecPtr) == 8);
+    Assert(sizeof(tmpWk->lastReplayedEndRecPtr) == 8);
+
+    *readPtr = (XLogRecPtr)compare.u64[0];
+    *endPtr = (XLogRecPtr)compare.u64[1];
+#else
+    SpinLockAcquire(&tmpWk->ptrLck);
+    *readPtr = tmpWk->lastReplayedReadRecPtr;
+    *endPtr = tmpWk->lastReplayedEndRecPtr;
+    SpinLockRelease(&tmpWk->ptrLck);
+#endif /* __x86_64__ || __aarch64__ */
+}
+
+/* Run from both the dispatcher and the worker thread. */
+bool IsPageRedoWorkerProcess(int argc, char *argv[])
+{
+    return strcmp(argv[1], PROCESS_TYPE_CMD_ARG) == 0;
+}
+
+/* Run from the worker thread. */
+void AdaptArgvForPageRedoWorker(char *argv[])
+{
+    if (g_AUXILIARY_TYPE_CMD_ARG[0] == 0)
+        sprintf_s(g_AUXILIARY_TYPE_CMD_ARG, sizeof(g_AUXILIARY_TYPE_CMD_ARG), "-x%d", PageRedoProcess);
+    argv[3] = g_AUXILIARY_TYPE_CMD_ARG;
+}
+
+/* Run from the worker thread. */
+void GetThreadNameIfPageRedoWorker(int argc, char *argv[], char **threadNamePtr)
+{
+    if (*threadNamePtr == NULL && IsPageRedoWorkerProcess(argc, argv))
+        *threadNamePtr = "PageRedoWorker";
+}
+
+/* Run from the worker thread. */
+uint32 GetMyPageRedoWorkerIdWithLock()
+{
+    bool isWorkerStarting = false;
+    SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock));
+    isWorkerStarting = ((g_instance.comm_cxt.predo_cxt.state == REDO_STARTING_BEGIN) ? true : false);
+    SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock));
+    if (!isWorkerStarting) {
+        ereport(WARNING, (errmsg("GetMyPageRedoWorkerIdWithLock Page-redo-worker exit.")));
+        proc_exit(0);
+    }
+
+    return g_redoWorker->id;
+}
+
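+/*
+ * A comment-only sketch of the lock-free pattern used by
+ * SetCompletedReadEndPtr/GetCompletedReadEndPtr above, assuming the two
+ * adjacent 8-byte LSNs are treated as one 16-byte value (which the Asserts
+ * check). The CAS helper's default-argument form returns the current
+ * contents, so it doubles as an atomic 128-bit read; the writer retries
+ * until the CAS observes the value it expected:
+ *
+ *     uint128_u cur = atomic_compare_and_swap_u128(addr);           // read
+ *     uint128_u seen = atomic_compare_and_swap_u128(addr, cur, newVal);
+ *     // seen != cur means another writer won: copy seen into cur, retry
+ */
+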
+/* Run from any worker thread. */
+PGPROC *GetPageRedoWorkerProc(PageRedoWorker *worker)
+{
+    return worker->proc;
+}
+
+void HandlePageRedoInterrupts()
+{
+    if (t_thrd.page_redo_cxt.got_SIGHUP) {
+        t_thrd.page_redo_cxt.got_SIGHUP = false;
+        ProcessConfigFile(PGC_SIGHUP);
+    }
+
+    if (t_thrd.page_redo_cxt.shutdown_requested) {
+        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                      errmsg("page worker id %u exit for request", g_redoWorker->id)));
+
+        pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[g_redoWorker->id].threadState),
+                            PAGE_REDO_WORKER_EXIT);
+
+        proc_exit(1);
+    }
+}
+
+void ReferenceRedoItem(void *item)
+{
+    RedoItem *redoItem = (RedoItem *)item;
+    AddRefRecord(&redoItem->record);
+}
+
+void DereferenceRedoItem(void *item)
+{
+    RedoItem *redoItem = (RedoItem *)item;
+    SubRefRecord(&redoItem->record);
+}
+
+void ReferenceRecParseState(XLogRecParseState *recordstate)
+{
+    ParseBufferDesc *descstate = (ParseBufferDesc *)((char *)recordstate - sizeof(ParseBufferDesc));
+    (void)pg_atomic_fetch_add_u32(&(descstate->refcount), 1);
+}
+
+void DereferenceRecParseState(XLogRecParseState *recordstate)
+{
+    ParseBufferDesc *descstate = (ParseBufferDesc *)((char *)recordstate - sizeof(ParseBufferDesc));
+    (void)pg_atomic_fetch_sub_u32(&(descstate->refcount), 1);
+}
+
+#define STRUCT_CONTAINER(type, membername, ptr) ((type *)((char *)(ptr)-offsetof(type, membername)))
+
+#ifdef USE_ASSERT_CHECKING
+void RecordBlockCheck(void *rec, XLogRecPtr curPageLsn, uint32 blockId, bool replayed)
+{
+    XLogReaderState *record = (XLogReaderState *)rec;
+    if (record->blocks[blockId].forknum != MAIN_FORKNUM) {
+        return;
+    }
+
+    if (replayed) {
+        uint32 rmid = XLogRecGetRmid(record);
+        uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+        if (curPageLsn == InvalidXLogRecPtr && (rmid == RM_HEAP2_ID || rmid == RM_HEAP_ID || rmid == RM_HEAP3_ID)) {
+            uint32 shiftSize = 32;
+            ereport(LOG, (errmsg("pass checked, record lsn:%X/%X, type: %u %u",
+                                 static_cast<uint32>(record->EndRecPtr >> shiftSize),
+                                 static_cast<uint32>(record->EndRecPtr),
+                                 record->decoded_record->xl_rmid, record->decoded_record->xl_info)));
+        } else if (!(rmid == RM_HEAP2_ID && info == XLOG_HEAP2_VISIBLE) &&
+                   !(rmid == RM_HEAP_ID && info == XLOG_HEAP_NEWPAGE)) {
+            Assert(XLByteLE(record->EndRecPtr, curPageLsn));
+        }
+    }
+
+    Assert(blockId < (XLR_MAX_BLOCK_ID + 1));
+    record->blocks[blockId].replayed = 1;
+}
+
+#endif
+
+void AddRecordReadBlocks(void *rec, uint32 readblocks)
+{
+    XLogReaderState *record = (XLogReaderState *)rec;
+    record->readblocks += readblocks;
+}
+
+void AddRefRecord(void *rec)
+{
+    pg_memory_barrier();
+#ifndef EXTREME_RTO_DEBUG
+    (void)pg_atomic_fetch_add_u32(&((XLogReaderState *)rec)->refcount, 1);
+#else
+    uint32 relCount = pg_atomic_fetch_add_u32(&((XLogReaderState *)rec)->refcount, 1);
+
+    const int stack_size = 5;
+    const int max_out_put_buf = 4096;
+    void *buffer[stack_size];
+    int nptrs;
+    char output[max_out_put_buf];
+    char **strings;
+    nptrs = backtrace(buffer, stack_size);
+    strings = backtrace_symbols(buffer, nptrs);
+
+    int ret = sprintf_s(output, sizeof(output), "before add relcount %u lsn %X/%X call back trace: \n", relCount,
+                        (uint32)(((XLogReaderState *)rec)->EndRecPtr >> 32),
+                        (uint32)(((XLogReaderState *)rec)->EndRecPtr));
+    securec_check_ss_c(ret, "\0", "\0");
+    for (int i = 0; i < nptrs; ++i) {
+        ret = strcat_s(output, max_out_put_buf - strlen(output), strings[i]);
+        securec_check_ss_c(ret, "\0", "\0");
+        ret = strcat_s(output, max_out_put_buf - strlen(output), "\n");
+        securec_check_ss_c(ret, "\0", "\0");
+    }
+
+ free(strings); + ereport(LOG, (errcode(ERRCODE_DATA_CORRUPTED), errmsg(" AddRefRecord print: %s", output))); + +#endif +} + +void SubRefRecord(void *rec) +{ + pg_memory_barrier(); + Assert(((XLogReaderState *)rec)->refcount != 0); + uint32 relCount = pg_atomic_sub_fetch_u32(&((XLogReaderState *)rec)->refcount, 1); +#ifdef EXTREME_RTO_DEBUG + const int stack_size = 5; + const int max_out_put_buf = 4096; + void *buffer[stack_size]; + int nptrs; + char output[max_out_put_buf]; + char **strings; + nptrs = backtrace(buffer, stack_size); + strings = backtrace_symbols(buffer, nptrs); + + int ret = sprintf_s(output, sizeof(output), "after sub relcount %u lsn %X/%X call back trace:\n", relCount, + (uint32)(((XLogReaderState *)rec)->EndRecPtr >> 32), + (uint32)(((XLogReaderState *)rec)->EndRecPtr)); + securec_check_ss_c(ret, "\0", "\0"); + for (int i = 0; i < nptrs; ++i) { + ret = strcat_s(output, max_out_put_buf - strlen(output), strings[i]); + securec_check_ss_c(ret, "\0", "\0"); + ret = strcat_s(output, max_out_put_buf - strlen(output), "\n"); + securec_check_ss_c(ret, "\0", "\0"); + } + free(strings); + ereport(LOG, (errcode(ERRCODE_DATA_CORRUPTED), errmsg(" SubRefRecord print: %s", output))); + +#endif + + if (relCount == 0) { + RedoItem *item = STRUCT_CONTAINER(RedoItem, record, rec); + FreeRedoItem(item); + } +} + +bool BatchRedoParseItemAndDispatch(RedoItem *item) +{ + uint32 blockNum = 0; + XLogRecParseState *recordblockstate = XLogParseToBlockForExtermeRTO(&item->record, &blockNum); + if (recordblockstate == NULL) { + if (blockNum == 0) { + return false; + } + return true; /* out of mem */ + } + + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + AddPageRedoItem(myRedoLine->managerThd, recordblockstate); + return false; +} + +void BatchRedoDistributeEndMark(void) +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + SendPageRedoEndMark(myRedoLine->managerThd); +} + +void BatchRedoProcLsnForwarder(RedoItem *lsnForwarder) +{ + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); + (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); + + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + AddPageRedoItem(myRedoLine->managerThd, lsnForwarder); +} + +void BatchRedoProcCleanupMark(RedoItem *cleanupMark) +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); + AddPageRedoItem(myRedoLine->managerThd, cleanupMark); + ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]BatchRedoProcCleanupMark has cleaned InvalidPages"))); +} + +#ifdef ENABLE_DISTRIBUTE_TEST +// inject delay to slow the process and also can be used as UT mock stub +void InjectDelayWaitRedoPageManagerQueueEmpty() +{ + const uint32 sleepTime = 1000000; + ereport(LOG, (errmsg("ProcessRedoPageManagerQueueEmpty sleep"))); + pg_usleep(sleepTime); +} +#endif + +void WaitAllRedoWorkerQueueEmpty() +{ + if ((get_real_recovery_parallelism() <= 1) || (GetBatchCount() == 0)) { + return; + } + for (uint j = 0; j < g_dispatcher->pageLineNum; ++j) { + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[j]; + + for (uint32 i = 0; i < myRedoLine->redoThdNum; ++i) { + while (!SPSCBlockingQueueIsEmpty(myRedoLine->redoThd[i]->queue)) { + RedoInterruptCallBack(); + } + } + } +} + +bool BatchRedoDistributeItems(void **eleArry, uint32 eleNum) +{ + bool parsecomplete = false; + for (uint32 i = 0; i < eleNum; 
i++) { + if (eleArry[i] == (void *)&g_redoEndMark) { + return true; + } else if (eleArry[i] == (void *)&g_GlobalLsnForwarder) { + BatchRedoProcLsnForwarder((RedoItem *)eleArry[i]); + } else if (eleArry[i] == (void *)&g_cleanupMark) { + BatchRedoProcCleanupMark((RedoItem *)eleArry[i]); + } else if (eleArry[i] == (void *)&g_closefdMark) { + smgrcloseall(); + } else if (eleArry[i] == (void *)&g_cleanInvalidPageMark) { + forget_range_invalid_pages((void *)eleArry[i]); + } else { + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + RedoItem *item = (RedoItem *)eleArry[i]; + UpdateRecordGlobals(item, g_redoWorker->standbyState); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3], + g_redoWorker->timeCostList[TIME_COST_STEP_4]); + do { + parsecomplete = BatchRedoParseItemAndDispatch(item); + RedoInterruptCallBack(); + } while (parsecomplete); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + DereferenceRedoItem(item); + } + } + + return false; +} + +void BatchRedoMain() +{ + void **eleArry; + uint32 eleNum; + + (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); + g_parseManager = &(g_dispatcher->parseManager); + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + while (SPSCBlockingQueueGetAll(g_redoWorker->queue, &eleArry, &eleNum)) { + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); + bool isEnd = BatchRedoDistributeItems(eleArry, eleNum); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]); + SPSCBlockingQueuePopN(g_redoWorker->queue, eleNum); + if (isEnd) + break; + + RedoInterruptCallBack(); + ADD_ABNORMAL_POSITION(1); + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + } + + RedoThrdWaitForExit(g_redoWorker); +} + +uint32 GetWorkerId(const RedoItemTag *redoItemTag, uint32 workerCount) +{ + if (workerCount != 0) { + return tag_hash(redoItemTag, sizeof(RedoItemTag)) % workerCount; + } + return 0; +} + +uint32 GetWorkerId(const uint32 attId, const uint32 workerCount) +{ + if (workerCount != 0) { + return attId % workerCount; + } + return 0; +} + +void RedoPageManagerDistributeToAllOneBlock(XLogRecParseState *ddlParseState) +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; + + ddlParseState->nextrecord = NULL; + + for (uint32 i = 0; i < WorkerNumPerMng; ++i) { + XLogRecParseState *newState = XLogParseBufferCopy(ddlParseState); + AddPageRedoItem(myRedoLine->redoThd[i], newState); + } +} + +void ReleaseRecParseState(PageRedoPipeline *myRedoLine, HTAB *redoItemHash, RedoItemHashEntry *redoItemEntry, uint32 workId) +{ + XLogRecParseState *cur_state = redoItemEntry->head; + XLogRecParseState *releaseHeadState = redoItemEntry->head; + XLogRecParseState *releaseTailState = NULL; + unsigned int del_from_hash_item_num = 0; + unsigned int new_hash; + LWLock *xlog_partition_lock; + + /* Items that have been replayed(refcount == 0) can be released */ + while (cur_state != NULL) { + ParseBufferDesc *descstate = (ParseBufferDesc *)((char *)cur_state - sizeof(ParseBufferDesc)); + unsigned int refCount = pg_atomic_read_u32(&descstate->refcount); + + if (refCount == 0) { + releaseTailState = cur_state; + del_from_hash_item_num++; + cur_state = (XLogRecParseState *)(cur_state->nextrecord); + } else { + break; + } + } + + new_hash = XlogTrackTableHashCode(&redoItemEntry->redoItemTag); + xlog_partition_lock = XlogTrackMappingPartitionLock(new_hash); + + if 
(del_from_hash_item_num > 0) { + (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE); + if (releaseTailState != NULL) { + redoItemEntry->head = (XLogRecParseState *)releaseTailState->nextrecord; + releaseTailState->nextrecord = NULL; + } else { + redoItemEntry->head = NULL; + } + XLogBlockParseStateRelease(releaseHeadState); + redoItemEntry->redoItemNum = redoItemEntry->redoItemNum - del_from_hash_item_num; + LWLockRelease(xlog_partition_lock); + } + + if (redoItemEntry->redoItemNum == 0) { + (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE); + if (hash_search(redoItemHash, (void *)&redoItemEntry->redoItemTag, HASH_REMOVE, NULL) == NULL) { + ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED), errmsg("redo item hash table corrupted"))); + } + LWLockRelease(xlog_partition_lock); + } + + return; +} + +void RedoPageManagerDistributeToRedoThd(PageRedoPipeline *myRedoLine, + HTAB *redoItemHash, RedoItemHashEntry *redoItemEntry, uint32 workId) +{ + XLogRecParseState *cur_state = redoItemEntry->head; + XLogRecParseState *distribute_head = NULL; + XLogRecParseState *distribute_tail = NULL; + int distribute_item_num = 0; + + while (cur_state != NULL) { + if (cur_state->distributeStatus != XLOG_NO_DISTRIBUTE) { + cur_state = (XLogRecParseState *)cur_state->nextrecord; + continue; + } + + if (distribute_head == NULL) { + distribute_head = cur_state; + } + cur_state->distributeStatus = XLOG_MID_DISTRIBUTE; + distribute_tail = cur_state; + distribute_item_num++; + cur_state = (XLogRecParseState *)cur_state->nextrecord; + } + + if (distribute_item_num > 0) { + distribute_head->distributeStatus = XLOG_HEAD_DISTRIBUTE; + distribute_tail->distributeStatus = XLOG_TAIL_DISTRIBUTE; + AddPageRedoItem(myRedoLine->redoThd[workId], distribute_head); + } + + return; +} + +void RedoPageManagerDistributeBlockRecord(HTAB *redoItemHash, XLogRecParseState *parsestate) +{ + static uint32 total_count = 0; + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + HTAB *curMap = redoItemHash; + hash_seq_init(&status, curMap); + + total_count++; + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + uint32 workId = GetWorkerId(&redoItemEntry->redoItemTag, WorkerNumPerMng); + ReleaseRecParseState(myRedoLine, curMap, redoItemEntry, workId); + RedoPageManagerDistributeToRedoThd(myRedoLine, curMap, redoItemEntry, workId); + } + + if (parsestate != NULL) { + RedoPageManagerDistributeToAllOneBlock(parsestate); + } +} + +void WaitCurrentPipeLineRedoWorkersQueueEmpty() +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; + + for (uint32 i = 0; i < WorkerNumPerMng; ++i) { + while (!SPSCBlockingQueueIsEmpty(myRedoLine->redoThd[i]->queue)) { + RedoInterruptCallBack(); + } + } +} + +static void ReleaseReplayedInParse(PageRedoPipeline* myRedoLine, uint32 workerNum) +{ + HASH_SEQ_STATUS status; + RedoItemHashEntry *redoItemEntry = NULL; + HTAB *curMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; + hash_seq_init(&status, curMap); + + while ((redoItemEntry = (RedoItemHashEntry *)hash_seq_search(&status)) != NULL) { + if (g_redoWorker->slotId == GetSlotId(redoItemEntry->redoItemTag.rNode, 0, 0, GetBatchCount())) { + uint32 workId = GetWorkerId(&redoItemEntry->redoItemTag, workerNum); + ReleaseRecParseState(myRedoLine, curMap, redoItemEntry, workId); 
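+            /*
+             * ReleaseRecParseState() walks this entry's chain and frees every
+             * parse state whose refcount has dropped to zero, i.e. whose redo
+             * has already been applied; once a chain empties, the entry is
+             * removed from the per-slot redo item hash as well.
+             */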
+        }
+    }
+}
+
+static void WaitAndTryReleaseWorkerReplayedRec(PageRedoPipeline *myRedoLine, uint32 workerNum)
+{
+    bool queueIsEmpty = false;
+    while (!queueIsEmpty) {
+        queueIsEmpty = true;
+        for (uint32 i = 0; i < workerNum; i++) {
+            if (!RedoWorkerIsIdle(myRedoLine->redoThd[i])) {
+                queueIsEmpty = false;
+                ReleaseReplayedInParse(myRedoLine, workerNum);
+                pg_usleep(50000L);
+                break;
+            }
+        }
+    }
+}
+
+void DispatchEndMarkToRedoWorkerAndWait()
+{
+    PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId];
+    const uint32 WorkerNumPerMng = get_page_redo_worker_num_per_manager();
+    for (uint32 i = 0; i < WorkerNumPerMng; ++i)
+        SendPageRedoEndMark(myRedoLine->redoThd[i]);
+
+    /* Keep releasing already-replayed items while we wait */
+    WaitAndTryReleaseWorkerReplayedRec(myRedoLine, WorkerNumPerMng);
+    for (uint32 i = 0; i < myRedoLine->redoThdNum; i++) {
+        WaitPageRedoWorkerReachLastMark(myRedoLine->redoThd[i]);
+    }
+    ReleaseReplayedInParse(myRedoLine, WorkerNumPerMng);
+}
+
+void RedoPageManagerDdlAction(XLogRecParseState *parsestate)
+{
+    switch (parsestate->blockparse.blockhead.block_valid) {
+        case BLOCK_DATA_DROP_DATABASE_TYPE:
+            xlog_db_drop(parsestate->blockparse.blockhead.end_ptr, parsestate->blockparse.blockhead.dbNode,
+                         parsestate->blockparse.blockhead.spcNode);
+            break;
+        case BLOCK_DATA_CREATE_DATABASE_TYPE:
+            xlog_db_create(parsestate->blockparse.blockhead.dbNode, parsestate->blockparse.blockhead.spcNode,
+                           parsestate->blockparse.extra_rec.blockdatabase.src_db_id,
+                           parsestate->blockparse.extra_rec.blockdatabase.src_tablespace_id);
+            break;
+        case BLOCK_DATA_DROP_TBLSPC_TYPE:
+            xlog_drop_tblspc(parsestate->blockparse.blockhead.spcNode);
+            break;
+        case BLOCK_DATA_SEG_FILE_EXTEND_TYPE:
+            {
+                Assert(0);
+            }
+            break;
+        case BLOCK_DATA_SEG_SPACE_DROP:
+        case BLOCK_DATA_SEG_FULL_SYNC_TYPE:
+        case BLOCK_DATA_SEG_EXTEND:
+            ProcSegPageCommonRedo(parsestate);
+            break;
+        default:
+            break;
+    }
+}
+
+void RedoPageManagerSmgrClose(XLogRecParseState *parsestate)
+{
+    switch (parsestate->blockparse.blockhead.block_valid) {
+        case BLOCK_DATA_DROP_DATABASE_TYPE:
+            smgrcloseall();
+            break;
+        case BLOCK_DATA_SEG_FULL_SYNC_TYPE:
+            ProcSegPageJustFreeChildState(parsestate);
+            break;
+        default:
+            break;
+    }
+}
+
+void RedoPageManagerSyncDdlAction(XLogRecParseState *parsestate)
+{
+    /* At this moment every worker queue is empty; just find out which pipeline will do the work. */
+    uint32 expected = 0;
+    const uint32 pipelineNum = g_dispatcher->pageLineNum;
+    pg_atomic_compare_exchange_u32(&g_dispatcher->syncEnterCount, &expected, pipelineNum);
+    uint32 entershareCount = pg_atomic_sub_fetch_u32(&g_dispatcher->syncEnterCount, 1);
+
+    MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx);
+    if (entershareCount == 0) {
+        /* do actual work */
+        RedoPageManagerDdlAction(parsestate);
+    } else {
+        RedoPageManagerSmgrClose(parsestate);
+        do {
+            RedoInterruptCallBack();
+            entershareCount = pg_atomic_read_u32(&g_dispatcher->syncEnterCount);
+        } while (entershareCount != 0);
+    }
+    (void)MemoryContextSwitchTo(oldCtx);
+
+    expected = 0;
+    pg_atomic_compare_exchange_u32(&g_dispatcher->syncExitCount, &expected, pipelineNum);
+    uint32 exitShareCount = pg_atomic_sub_fetch_u32(&g_dispatcher->syncExitCount, 1);
+    while (exitShareCount != 0) {
+        RedoInterruptCallBack();
+        exitShareCount = pg_atomic_read_u32(&g_dispatcher->syncExitCount);
+    }
+
+    parsestate->nextrecord = NULL;
+    XLogBlockParseStateRelease(parsestate);
+}
+
+void RedoPageManagerDoDropAction(XLogRecParseState *parsestate, HTAB *hashMap)
+{
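+    /*
+     * Drop handling happens in four steps: clear every tracked block for the
+     * dropped object (via a copy of the parse state), distribute pending
+     * per-page work, drain this pipeline's redo workers, then let exactly one
+     * pipeline apply the DDL in RedoPageManagerSyncDdlAction.
+     */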
XLogRecParseState *newState = XLogParseBufferCopy(parsestate); + PRTrackClearBlock(newState, hashMap); + RedoPageManagerDistributeBlockRecord(hashMap, parsestate); + WaitCurrentPipeLineRedoWorkersQueueEmpty(); + RedoPageManagerSyncDdlAction(parsestate); +} + +void RedoPageManagerDoSmgrAction(XLogRecParseState *recordblockstate) +{ + RedoBufferInfo bufferinfo = {0}; + void *blockrecbody; + XLogBlockHead *blockhead; + + blockhead = &recordblockstate->blockparse.blockhead; + blockrecbody = &recordblockstate->blockparse.extra_rec; + + XLogBlockInitRedoBlockInfo(blockhead, &bufferinfo.blockinfo); + + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + XLogBlockDdlDoSmgrAction(blockhead, blockrecbody, &bufferinfo); + (void)MemoryContextSwitchTo(oldCtx); + + recordblockstate->nextrecord = NULL; + XLogBlockParseStateRelease(recordblockstate); +} + +void RedoPageManagerDoDataTypeAction(XLogRecParseState *parsestate, HTAB *hashMap) +{ + XLogBlockDdlParse *ddlrecparse = NULL; + XLogBlockParseGetDdlParse(parsestate, ddlrecparse); + + if (ddlrecparse->blockddltype == BLOCK_DDL_DROP_RELNODE || + ddlrecparse->blockddltype == BLOCK_DDL_TRUNCATE_RELNODE) { + XLogRecParseState *newState = XLogParseBufferCopy(parsestate); + PRTrackClearBlock(newState, hashMap); + RedoPageManagerDistributeBlockRecord(hashMap, parsestate); + WaitCurrentPipeLineRedoWorkersQueueEmpty(); + } + + RedoPageManagerDoSmgrAction(parsestate); + +} + +void PageManagerProcLsnForwarder(RedoItem *lsnForwarder) +{ + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); + (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); + + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; + + for (uint32 i = 0; i < WorkerNumPerMng; ++i) { + AddPageRedoItem(myRedoLine->redoThd[i], lsnForwarder); + } +} + +void PageManagerDistributeBcmBlock(XLogRecParseState *preState) +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; + uint32 workId = GetWorkerId((uint32)preState->blockparse.blockhead.forknum, WorkerNumPerMng); + AddPageRedoItem(myRedoLine->redoThd[workId], preState); +} + +void PageManagerProcCleanupMark(RedoItem *cleanupMark) +{ + PageRedoPipeline *myRedoLine = &g_dispatcher->pageLines[g_redoWorker->slotId]; + const uint32 WorkerNumPerMng = myRedoLine->redoThdNum; + g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); + for (uint32 i = 0; i < WorkerNumPerMng; ++i) { + AddPageRedoItem(myRedoLine->redoThd[i], cleanupMark); + } + ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]PageManagerProcCleanupMark has cleaned InvalidPages"))); +} + +void PageManagerProcCheckPoint(HTAB *hashMap, XLogRecParseState *parseState) +{ + Assert(IsCheckPoint(parseState)); + RedoPageManagerDistributeBlockRecord(hashMap, parseState); + bool needWait = parseState->isFullSync; + if (needWait) { + pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); + } + + XLogBlockParseStateRelease(parseState); + uint32 val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + while (val != 0) { + RedoInterruptCallBack(); + val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + } + +#ifdef USE_ASSERT_CHECKING + int printLevel = WARNING; +#else + int printLevel = DEBUG1; +#endif + if (log_min_messages <= printLevel) { + GetThreadBufferLeakNum(); + } +} + +void PageManagerProcCreateTableSpace(HTAB *hashMap, XLogRecParseState 
*parseState) +{ + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + bool needWait = parseState->isFullSync; + if (needWait) { + pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); + } + + XLogBlockParseStateRelease(parseState); + uint32 val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + while (val != 0) { + RedoInterruptCallBack(); + val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + } +} + +void PageManagerProcSegFullSyncState(HTAB *hashMap, XLogRecParseState *parseState) +{ + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + WaitCurrentPipeLineRedoWorkersQueueEmpty(); + RedoPageManagerSyncDdlAction(parseState); +} + +void OnDemandPageManagerProcSegFullSyncState(XLogRecParseState *parsestate) +{ + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + RedoPageManagerDdlAction(parsestate); + (void)MemoryContextSwitchTo(oldCtx); + + parsestate->nextrecord = NULL; + XLogBlockParseStateRelease(parsestate); +} + +void PageManagerProcSegPipeLineSyncState(HTAB *hashMap, XLogRecParseState *parseState) +{ + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + WaitCurrentPipeLineRedoWorkersQueueEmpty(); + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + + RedoPageManagerDdlAction(parseState); + + (void)MemoryContextSwitchTo(oldCtx); + XLogBlockParseStateRelease(parseState); +} + +void OnDemandPageManagerProcSegPipeLineSyncState(XLogRecParseState *parseState) +{ + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + RedoPageManagerDdlAction(parseState); + (void)MemoryContextSwitchTo(oldCtx); + + XLogBlockParseStateRelease(parseState); +} + +static void WaitNextBarrier(XLogRecParseState *parseState) +{ + bool needWait = parseState->isFullSync; + if (needWait) { + pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); + } + + XLogBlockParseStateRelease(parseState); + uint32 val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + while (val != 0) { + RedoInterruptCallBack(); + val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + } +} + +static void OnDemandPageManagerRedoSegParseState(XLogRecParseState *preState) +{ + static uint32 seg_total_count = 0; + static uint32 seg_full_count = 0; + + Assert(g_redoWorker->slotId == 0); + switch (preState->blockparse.blockhead.block_valid) { + case BLOCK_DATA_SEG_EXTEND: + seg_total_count++; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + OnDemandPageManagerProcSegPipeLineSyncState(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + break; + case BLOCK_DATA_SEG_FULL_SYNC_TYPE: + seg_full_count++; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + OnDemandPageManagerProcSegFullSyncState(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + break; + case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: + default: + { + Assert(0); + } + break; + } +} + +void PageManagerRedoParseState(XLogRecParseState *preState) +{ + HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; + + switch (preState->blockparse.blockhead.block_valid) { + case BLOCK_DATA_MAIN_DATA_TYPE: + case BLOCK_DATA_UNDO_TYPE: + case BLOCK_DATA_VM_TYPE: + case BLOCK_DATA_FSM_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + PRTrackAddBlock(preState, hashMap); + SetCompletedReadEndPtr(g_redoWorker, preState->blockparse.blockhead.start_ptr, + preState->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + break; + case BLOCK_DATA_DDL_TYPE: + 
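+            /*
+             * DDL block records are not replayed page-by-page: for drops and
+             * truncates the tracked blocks are cleared and the pipeline's
+             * redo workers drained first, then the smgr action runs here in
+             * the manager (see RedoPageManagerDoDataTypeAction).
+             */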
GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + RedoPageManagerDoDataTypeAction(preState, hashMap); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + break; + case BLOCK_DATA_SEG_EXTEND: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + OnDemandPageManagerRedoSegParseState(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]); + break; + case BLOCK_DATA_DROP_DATABASE_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_5]); + RedoPageManagerDoDropAction(preState, hashMap); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_5]); + break; + case BLOCK_DATA_DROP_TBLSPC_TYPE: + /* just make sure any other ddl before drop tblspc is done */ + XLogBlockParseStateRelease(preState); + break; + case BLOCK_DATA_CREATE_DATABASE_TYPE: + case BLOCK_DATA_SEG_FILE_EXTEND_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + OnDemandPageManagerRedoSegParseState(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + case BLOCK_DATA_SEG_FULL_SYNC_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + OnDemandPageManagerRedoSegParseState(preState); + break; + case BLOCK_DATA_CREATE_TBLSPC_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); + PageManagerProcCreateTableSpace(hashMap, preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); + break; + case BLOCK_DATA_XLOG_COMMON_TYPE: + PageManagerProcCheckPoint(hashMap, preState); + break; + case BLOCK_DATA_NEWCU_TYPE: + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + PageManagerDistributeBcmBlock(preState); + break; + case BLOCK_DATA_SEG_SPACE_DROP: + case BLOCK_DATA_SEG_SPACE_SHRINK: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + RedoPageManagerDistributeBlockRecord(hashMap, preState); + WaitCurrentPipeLineRedoWorkersQueueEmpty(); + RedoPageManagerSyncDdlAction(preState); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + break; + case BLOCK_DATA_BARRIER_TYPE: + RedoPageManagerDistributeBlockRecord(hashMap, preState); + WaitNextBarrier(preState); + break; + default: + XLogBlockParseStateRelease(preState); + break; + } +} + +bool PageManagerRedoDistributeItems(void **eleArry, uint32 eleNum) +{ + HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[g_redoWorker->slotId]; + + for (uint32 i = 0; i < eleNum; i++) { + if (eleArry[i] == (void *)&g_redoEndMark) { + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + return true; + } else if (eleArry[i] == (void *)&g_GlobalLsnForwarder) { + SetCompletedReadEndPtr(g_redoWorker, ((RedoItem *)eleArry[i])->record.ReadRecPtr, + ((RedoItem *)eleArry[i])->record.EndRecPtr); + RedoPageManagerDistributeBlockRecord(hashMap, NULL); + PageManagerProcLsnForwarder((RedoItem *)eleArry[i]); + continue; + } else if (eleArry[i] == (void *)&g_cleanupMark) { + PageManagerProcCleanupMark((RedoItem *)eleArry[i]); + continue; + } else if (eleArry[i] == (void *)&g_closefdMark) { + smgrcloseall(); + continue; + } else if (eleArry[i] == (void *)&g_cleanInvalidPageMark) { + forget_range_invalid_pages((void *)eleArry[i]); + continue; + } + XLogRecParseState *recordblockstate = (XLogRecParseState *)eleArry[i]; + XLogRecParseState *nextState = recordblockstate; + do { + XLogRecParseState *preState = nextState; + nextState = (XLogRecParseState *)nextState->nextrecord; + preState->nextrecord = NULL; +#ifdef ENABLE_UT + TestXLogRecParseStateEventProbe(UTEST_EVENT_RTO_PAGEMGR_REDO_BEFORE_DISTRIBUTE_ITEMS, + 
__FUNCTION__, preState);
+#endif
+
+            PageManagerRedoParseState(preState);
+#ifdef ENABLE_UT
+            TestXLogRecParseStateEventProbe(UTEST_EVENT_RTO_PAGEMGR_REDO_AFTER_DISTRIBUTE_ITEMS,
+                                            __FUNCTION__, preState);
+#endif
+        } while (nextState != NULL);
+    }
+
+    float4 ratio = (float4)g_dispatcher->parseManager.memctl.usedblknum / g_dispatcher->parseManager.memctl.totalblknum;
+    while (ratio > ONDEMAND_DISTRIBUTE_RATIO) {
+        ereport(WARNING, (errcode(ERRCODE_LOG),
+                          errmsg("[On-demand] Parse buffer num approach critical value, distribute block record by force,"
+                                 " slotid %d, usedblknum %d, totalblknum %d", g_redoWorker->slotId,
+                                 g_dispatcher->parseManager.memctl.usedblknum,
+                                 g_dispatcher->parseManager.memctl.totalblknum)));
+        GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_9]);
+        RedoPageManagerDistributeBlockRecord(hashMap, NULL);
+        CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_9]);
+        pg_usleep(1000000); /* 1 sec */
+        ratio = (float4)g_dispatcher->parseManager.memctl.usedblknum / g_dispatcher->parseManager.memctl.totalblknum;
+    }
+
+    return false;
+}
+
+void RedoPageManagerMain()
+{
+    void **eleArry;
+    uint32 eleNum;
+
+    (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts);
+    g_parseManager = &(g_dispatcher->parseManager);
+
+    GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]);
+    while (SPSCBlockingQueueGetAll(g_redoWorker->queue, &eleArry, &eleNum)) {
+        CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]);
+        bool isEnd = PageManagerRedoDistributeItems(eleArry, eleNum);
+        SPSCBlockingQueuePopN(g_redoWorker->queue, eleNum);
+        CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]);
+        if (isEnd)
+            break;
+
+        RedoInterruptCallBack();
+        ADD_ABNORMAL_POSITION(5);
+        GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]);
+    }
+
+    RedoThrdWaitForExit(g_redoWorker);
+}
+
+bool IsXactXlog(const XLogReaderState *record)
+{
+    if (XLogRecGetRmid(record) != RM_XACT_ID) {
+        return false;
+    }
+    return true;
+}
+
+void TrxnManagerProcLsnForwarder(RedoItem *lsnForwarder)
+{
+    SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr);
+    (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1);
+
+    AddPageRedoItem(g_dispatcher->trxnLine.redoThd, lsnForwarder);
+}
+
+void TrxnManagerProcCleanupMark(RedoItem *cleanupMark)
+{
+    g_redoWorker->xlogInvalidPages = XLogGetInvalidPages();
+    AddPageRedoItem(g_dispatcher->trxnLine.redoThd, cleanupMark);
+    ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]TrxnManagerProcCleanupMark has cleaned InvalidPages")));
+}
+
+bool TrxnManagerDistributeItemsBeforeEnd(RedoItem *item)
+{
+    bool exitFlag = false;
+    if (item == &g_redoEndMark) {
+        exitFlag = true;
+    } else if (item == (RedoItem *)&g_GlobalLsnForwarder) {
+        TrxnManagerProcLsnForwarder(item);
+    } else if (item == (RedoItem *)&g_cleanupMark) {
+        TrxnManagerProcCleanupMark(item);
+    } else if (item == (void *)&g_closefdMark) {
+        smgrcloseall();
+    } else if (item == (void *)&g_cleanInvalidPageMark) {
+        forget_range_invalid_pages((void *)item);
+    } else {
+        GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_4]);
+        if (IsCheckPoint(&item->record) || IsTableSpaceDrop(&item->record) || IsTableSpaceCreate(&item->record) ||
+            (IsXactXlog(&item->record) && XactWillRemoveRelFiles(&item->record)) || IsBarrierRelated(&item->record)) {
+            uint32 relCount;
+            do {
+                RedoInterruptCallBack();
+                relCount = pg_atomic_read_u32(&item->record.refcount);
+            } while (relCount != 1);
+        }
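+        /*
+         * Records with global effects (checkpoints, tablespace create or
+         * drop, xact records that unlink relfiles, barriers) are held here
+         * until this thread owns the last reference, i.e. every page
+         * pipeline has finished with the record, before it is handed to the
+         * trxn worker; refcount == 1 is the release condition above.
+         */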
CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4], g_redoWorker->timeCostList[TIME_COST_STEP_5]); +#ifdef ENABLE_UT + TestXLogReaderProbe(UTEST_EVENT_RTO_TRXNMGR_DISTRIBUTE_ITEMS, + __FUNCTION__, &item->record); +#endif + AddPageRedoItem(g_dispatcher->trxnLine.redoThd, item); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_5]); + } + return exitFlag; +} + +void GlobalLsnUpdate() +{ + t_thrd.xlog_cxt.standbyState = g_redoWorker->standbyState; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + if (LsnUpdate()) { + ExtremRtoUpdateMinCheckpoint(); + CheckRecoveryConsistency(); + } + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); +} + +bool LsnUpdate() +{ + XLogRecPtr minStart = MAX_XLOG_REC_PTR; + XLogRecPtr minEnd = MAX_XLOG_REC_PTR; + GetReplayedRecPtr(&minStart, &minEnd); + if ((minEnd != MAX_XLOG_REC_PTR) && (minStart != MAX_XLOG_REC_PTR)) { + SetXLogReplayRecPtr(minStart, minEnd); + return true; + } + return false; +} + +static void TrxnMangerQueueCallBack() +{ + GlobalLsnUpdate(); + HandlePageRedoInterrupts(); +} + +void TrxnManagerMain() +{ + (void)RegisterRedoInterruptCallBack(TrxnMangerQueueCallBack); + t_thrd.xlog_cxt.max_page_flush_lsn = get_global_max_page_flush_lsn(); + ereport(LOG, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("TrxnManagerMain: first get_global_max_page_flush_lsn %08X/%08X", + (uint32)(t_thrd.xlog_cxt.max_page_flush_lsn >> 32), (uint32)(t_thrd.xlog_cxt.max_page_flush_lsn)))); + while (true) { + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + if (FORCE_FINISH_ENABLED && t_thrd.xlog_cxt.max_page_flush_lsn == MAX_XLOG_REC_PTR) { + t_thrd.xlog_cxt.max_page_flush_lsn = get_global_max_page_flush_lsn(); + if (t_thrd.xlog_cxt.max_page_flush_lsn != MAX_XLOG_REC_PTR) { + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("TrxnManagerMain: second get_global_max_page_flush_lsn %08X/%08X", + (uint32)(t_thrd.xlog_cxt.max_page_flush_lsn >> 32), + (uint32)(t_thrd.xlog_cxt.max_page_flush_lsn)))); + } + } + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + if (!SPSCBlockingQueueIsEmpty(g_redoWorker->queue)) { + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + RedoItem *item = (RedoItem *)SPSCBlockingQueueTop(g_redoWorker->queue); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], + g_redoWorker->timeCostList[TIME_COST_STEP_2]); + bool isEnd = TrxnManagerDistributeItemsBeforeEnd(item); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]); + SPSCBlockingQueuePop(g_redoWorker->queue); + + if (isEnd) { + break; + } + } else { + long sleeptime = 80 * 1000; + pg_usleep(sleeptime); + } + + ADD_ABNORMAL_POSITION(2); + RedoInterruptCallBack(); + } + + RedoThrdWaitForExit(g_redoWorker); + GlobalLsnUpdate(); +} + +void TrxnWorkerProcLsnForwarder(RedoItem *lsnForwarder) +{ + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); + (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); +} + +void TrxnWorkNotifyRedoWorker() +{ + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; ++i) { + if (g_dispatcher->allWorkers[i]->role == REDO_PAGE_WORKER || + g_dispatcher->allWorkers[i]->role == REDO_PAGE_MNG) { + pg_atomic_write_u32(&(g_dispatcher->allWorkers[i]->fullSyncFlag), 0); + } + } +} + +void TrxnWorkrProcCleanupMark(RedoItem *cleanupMark) +{ + g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); + ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]TrxnWorkrProcCleanupMark has 
cleaned InvalidPages"))); +} + +bool CheckFullSyncCheckpoint(RedoItem *item) +{ + if (!IsCheckPoint(&(item->record))) { + return true; + } + + if (XLByteLE(item->record.ReadRecPtr, t_thrd.shemem_ptr_cxt.ControlFile->checkPoint)) { + return true; + } + + return false; +} + +static void TrxnWorkerQueueCallBack() +{ + if (XLByteLT(t_thrd.xlog_cxt.minRecoveryPoint, g_redoWorker->minRecoveryPoint)) { + t_thrd.xlog_cxt.minRecoveryPoint = g_redoWorker->minRecoveryPoint; + } + HandlePageRedoInterrupts(); +} + +void TrxnWorkMain() +{ +#ifdef ENABLE_MOT + MOTBeginRedoRecovery(); +#endif + (void)RegisterRedoInterruptCallBack(TrxnWorkerQueueCallBack); + if (ParseStateWithoutCache()) { + XLogRedoBufferInitFunc(&(g_redoWorker->bufferManager), MAX_LOCAL_BUFF_NUM, &recordRefOperate, + RedoInterruptCallBack); + } + + RedoItem *item = NULL; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + while ((item = (RedoItem *)SPSCBlockingQueueTop(g_redoWorker->queue)) != &g_redoEndMark) { + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); + if ((void *)item == (void *)&g_GlobalLsnForwarder) { + TrxnWorkerProcLsnForwarder((RedoItem *)item); + SPSCBlockingQueuePop(g_redoWorker->queue); + } else if ((void *)item == (void *)&g_cleanupMark) { + TrxnWorkrProcCleanupMark((RedoItem *)item); + SPSCBlockingQueuePop(g_redoWorker->queue); + } else if ((void *)item == (void *)&g_closefdMark) { + smgrcloseall(); + SPSCBlockingQueuePop(g_redoWorker->queue); + } else if ((void *)item == (void *)&g_cleanInvalidPageMark) { + forget_range_invalid_pages((void *)item); + SPSCBlockingQueuePop(g_redoWorker->queue); + } else { + t_thrd.xlog_cxt.needImmediateCkp = item->needImmediateCheckpoint; + bool fullSync = item->record.isFullSync; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + ApplySinglePageRecord(item); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3], + g_redoWorker->timeCostList[TIME_COST_STEP_4]); + SPSCBlockingQueuePop(g_redoWorker->queue); + SetCompletedReadEndPtr(g_redoWorker, item->record.ReadRecPtr, item->record.EndRecPtr); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4], + g_redoWorker->timeCostList[TIME_COST_STEP_5]); + if (fullSync) { + Assert(CheckFullSyncCheckpoint(item)); + TrxnWorkNotifyRedoWorker(); + } + + if (XactHasSegpageRelFiles(&item->record)) { + uint32 expected = 1; + pg_atomic_compare_exchange_u32((volatile uint32 *)&(g_dispatcher->segpageXactDoneFlag), &expected, 0); + } + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_5]); + DereferenceRedoItem(item); + RedoInterruptCallBack(); + } + ADD_ABNORMAL_POSITION(3); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2], g_redoWorker->timeCostList[TIME_COST_STEP_1]); + } + + SPSCBlockingQueuePop(g_redoWorker->queue); + if (ParseStateWithoutCache()) + XLogRedoBufferDestoryFunc(&(g_redoWorker->bufferManager)); +#ifdef ENABLE_MOT + MOTEndRedoRecovery(); +#endif +} + +void RedoPageWorkerCheckPoint(const XLogRecParseState *redoblockstate) +{ + CheckPoint checkPoint; + Assert(IsCheckPoint(redoblockstate)); + XLogSynAllBuffer(); + Assert(redoblockstate->blockparse.extra_rec.blockxlogcommon.maindatalen >= sizeof(checkPoint)); + errno_t rc = memcpy_s(&checkPoint, sizeof(checkPoint), + redoblockstate->blockparse.extra_rec.blockxlogcommon.maindata, sizeof(checkPoint)); + securec_check(rc, "\0", "\0"); + if (IsRestartPointSafe(checkPoint.redo)) { + pg_atomic_write_u64(&g_redoWorker->lastCheckedRestartPoint, 
redoblockstate->blockparse.blockhead.start_ptr); + } + + UpdateTimeline(&checkPoint); + +#ifdef USE_ASSERT_CHECKING + int printLevel = WARNING; +#else + int printLevel = DEBUG1; +#endif + if (log_min_messages <= printLevel) { + GetThreadBufferLeakNum(); + } +} + +void PageWorkerProcLsnForwarder(RedoItem *lsnForwarder) +{ + SetCompletedReadEndPtr(g_redoWorker, lsnForwarder->record.ReadRecPtr, lsnForwarder->record.EndRecPtr); + (void)pg_atomic_sub_fetch_u32(&lsnForwarder->record.refcount, 1); +} + +bool XlogNeedUpdateFsm(XLogRecParseState *procState, RedoBufferInfo *bufferinfo) +{ + XLogBlockHead *blockhead = &procState->blockparse.blockhead; + if (bufferinfo->pageinfo.page == NULL || !(bufferinfo->dirtyflag) || blockhead->forknum != MAIN_FORKNUM || + XLogBlockHeadGetValidInfo(blockhead) != BLOCK_DATA_MAIN_DATA_TYPE || blockhead->bucketNode != InvalidBktId) { + return false; + } + + Size freespace = PageGetHeapFreeSpace(bufferinfo->pageinfo.page); + + RmgrId rmid = XLogBlockHeadGetRmid(blockhead); + if (rmid == RM_HEAP2_ID) { + uint8 info = XLogBlockHeadGetInfo(blockhead) & ~XLR_INFO_MASK; + if (info == XLOG_HEAP2_CLEAN) { + return true; + } else if ((info == XLOG_HEAP2_MULTI_INSERT) && (freespace < BLCKSZ / 5)) { + return true; + } + + } else if (rmid == RM_HEAP_ID) { + uint8 info = XLogBlockHeadGetInfo(blockhead) & ~XLR_INFO_MASK; + if ((info == XLOG_HEAP_INSERT || info == XLOG_HEAP_UPDATE) && (freespace < BLCKSZ / 5)) { + return true; + } + } + + return false; +} + +void RedoPageWorkerRedoBcmBlock(XLogRecParseState *procState) +{ + RmgrId rmid = XLogBlockHeadGetRmid(&procState->blockparse.blockhead); + if (rmid == RM_HEAP2_ID) { + RelFileNode node; + node.spcNode = procState->blockparse.blockhead.spcNode; + node.dbNode = procState->blockparse.blockhead.dbNode; + node.relNode = procState->blockparse.blockhead.relNode; + node.bucketNode = procState->blockparse.blockhead.bucketNode; + node.opt = procState->blockparse.blockhead.opt; + XLogBlockNewCuParse *newCuParse = &(procState->blockparse.extra_rec.blocknewcu); + uint8 info = XLogBlockHeadGetInfo(&procState->blockparse.blockhead) & ~XLR_INFO_MASK; + switch (info & XLOG_HEAP_OPMASK) { + case XLOG_HEAP2_BCM: { + xl_heap_bcm *xlrec = (xl_heap_bcm *)(newCuParse->main_data); + heap_bcm_redo(xlrec, node, procState->blockparse.blockhead.end_ptr); + break; + } + case XLOG_HEAP2_LOGICAL_NEWPAGE: { + Assert(IsHeapFileNode(node)); + xl_heap_logical_newpage *xlrec = (xl_heap_logical_newpage *)(newCuParse->main_data); + char *cuData = newCuParse->main_data + SizeOfHeapLogicalNewPage; + heap_xlog_bcm_new_page(xlrec, node, cuData); + break; + } + default: + break; + } + } +} + +void RedoPageWorkerMain() +{ + (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts); + + if (ParseStateWithoutCache()) { + XLogRedoBufferInitFunc(&(g_redoWorker->bufferManager), MAX_LOCAL_BUFF_NUM, &recordRefOperate, + RedoInterruptCallBack); + } + + XLogRecParseState *redoblockstateHead = NULL; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]); + while ((redoblockstateHead = (XLogRecParseState *)SPSCBlockingQueueTop(g_redoWorker->queue)) != + (XLogRecParseState *)&g_redoEndMark) { + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]); + if ((void *)redoblockstateHead == (void *)&g_cleanupMark) { + g_redoWorker->xlogInvalidPages = XLogGetInvalidPages(); + SPSCBlockingQueuePop(g_redoWorker->queue); + ereport(LOG, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]RedoPageWorkerMain has cleaned 
InvalidPages"))); + continue; + } + + if ((void *)redoblockstateHead == (void *)&g_closefdMark) { + smgrcloseall(); + SPSCBlockingQueuePop(g_redoWorker->queue); + continue; + } + + if ((void *)redoblockstateHead == (void *)&g_cleanInvalidPageMark) { + forget_range_invalid_pages((void *)redoblockstateHead); + SPSCBlockingQueuePop(g_redoWorker->queue); + continue; + } + if ((void *)redoblockstateHead == (void *)&g_GlobalLsnForwarder) { + PageWorkerProcLsnForwarder((RedoItem *)redoblockstateHead); + SPSCBlockingQueuePop(g_redoWorker->queue); + continue; + } + RedoBufferInfo bufferinfo = {0}; + bool notfound = false; + bool updateFsm = false; + bool needRelease = true; + + XLogRecParseState *procState = redoblockstateHead; + Assert(procState->distributeStatus != XLOG_NO_DISTRIBUTE); + + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + while (procState != NULL) { + XLogRecParseState *redoblockstate = procState; + g_redoWorker->curRedoBlockState = (XLogBlockDataParse*)(&redoblockstate->blockparse.extra_rec); + // nextrecord will be redo in backwards position + procState = (procState->distributeStatus == XLOG_TAIL_DISTRIBUTE) ? + NULL : (XLogRecParseState *)procState->nextrecord; + switch (XLogBlockHeadGetValidInfo(&redoblockstate->blockparse.blockhead)) { + case BLOCK_DATA_MAIN_DATA_TYPE: + case BLOCK_DATA_UNDO_TYPE: + case BLOCK_DATA_VM_TYPE: + case BLOCK_DATA_FSM_TYPE: + needRelease = false; + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + notfound = XLogBlockRedoForExtremeRTO(redoblockstate, &bufferinfo, notfound, + g_redoWorker->timeCostList[TIME_COST_STEP_4], g_redoWorker->timeCostList[TIME_COST_STEP_5]); + DereferenceRecParseState(redoblockstate); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]); + break; + case BLOCK_DATA_XLOG_COMMON_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + RedoPageWorkerCheckPoint(redoblockstate); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + case BLOCK_DATA_DDL_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + XLogForgetDDLRedo(redoblockstate); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + case BLOCK_DATA_DROP_DATABASE_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + XLogDropDatabase(redoblockstate->blockparse.blockhead.dbNode); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + case BLOCK_DATA_NEWCU_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + RedoPageWorkerRedoBcmBlock(redoblockstate); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + case BLOCK_DATA_SEG_SPACE_DROP: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + XLogDropSegmentSpace(redoblockstate->blockparse.blockhead.spcNode, + redoblockstate->blockparse.blockhead.dbNode); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + 
redoblockstate->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + case BLOCK_DATA_SEG_SPACE_SHRINK: + XLogDropSpaceShrink(redoblockstate); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); + break; + case BLOCK_DATA_BARRIER_TYPE: + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + SetCompletedReadEndPtr(g_redoWorker, redoblockstate->blockparse.blockhead.start_ptr, + redoblockstate->blockparse.blockhead.end_ptr); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_6]); + break; + default: + break; + } + } + (void)MemoryContextSwitchTo(oldCtx); + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); + updateFsm = XlogNeedUpdateFsm(redoblockstateHead, &bufferinfo); + bool needWait = redoblockstateHead->isFullSync; + if (needWait) { + pg_atomic_write_u32(&g_redoWorker->fullSyncFlag, 1); + } + if (needRelease) { + XLogBlockParseStateRelease(redoblockstateHead); + } + /* the same page */ + ExtremeRtoFlushBuffer(&bufferinfo, updateFsm); + SPSCBlockingQueuePop(g_redoWorker->queue); + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_7]); + pg_memory_barrier(); + GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + uint32 val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + while (val != 0) { + RedoInterruptCallBack(); + val = pg_atomic_read_u32(&g_redoWorker->fullSyncFlag); + } + CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_8]); + RedoInterruptCallBack(); + CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2], g_redoWorker->timeCostList[TIME_COST_STEP_1]); + ADD_ABNORMAL_POSITION(4); + } + + SPSCBlockingQueuePop(g_redoWorker->queue); + if (ParseStateWithoutCache()) + XLogRedoBufferDestoryFunc(&(g_redoWorker->bufferManager)); +} + +void PutRecordToReadQueue(XLogReaderState *recordreader) +{ + SPSCBlockingQueuePut(g_dispatcher->readLine.readPageThd->queue, recordreader); +} + +inline void InitXLogRecordReadBuffer(XLogReaderState **initreader) +{ + XLogReaderState *newxlogreader; + XLogReaderState *readstate = g_dispatcher->rtoXlogBufState.initreader; + newxlogreader = NewReaderState(readstate); + g_dispatcher->rtoXlogBufState.initreader = NULL; + PutRecordToReadQueue(readstate); + SetCompletedReadEndPtr(g_redoWorker, readstate->ReadRecPtr, readstate->EndRecPtr); + *initreader = newxlogreader; +} + +void StartupSendFowarder(RedoItem *item) +{ + for (uint32 i = 0; i < g_dispatcher->pageLineNum; ++i) { + AddPageRedoItem(g_dispatcher->pageLines[i].batchThd, item); + } + + AddPageRedoItem(g_dispatcher->trxnLine.managerThd, item); +} + +void SendLsnFowarder() +{ + // update and read in the same thread, so no need atomic operation + g_GlobalLsnForwarder.record.ReadRecPtr = g_redoWorker->lastReplayedReadRecPtr; + g_GlobalLsnForwarder.record.EndRecPtr = g_redoWorker->lastReplayedEndRecPtr; + g_GlobalLsnForwarder.record.refcount = get_real_recovery_parallelism() - XLOG_READER_NUM; + g_GlobalLsnForwarder.record.isDecode = true; + PutRecordToReadQueue(&g_GlobalLsnForwarder.record); +} + +static inline bool ReadPageWorkerStop() +{ + return g_dispatcher->recoveryStop; +} + +void PushToWorkerLsn() +{ + static uint32 cur_recor_count = 0; + + cur_recor_count++; + + if (!IsExtremeRtoRunning()) { + return; + } + + uint32 refCount; + do { + refCount = pg_atomic_read_u32(&g_GlobalLsnForwarder.record.refcount); + RedoInterruptCallBack(); + } while (refCount != 0 && !ReadPageWorkerStop()); + cur_recor_count 
= 0; + SendLsnFowarder(); +} + +void ResetRtoXlogReadBuf(XLogRecPtr targetPagePtr) +{ + uint32 startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + if (startreadworker == WORKER_STATE_STOP) { + WalRcvCtlAcquireExitLock(); + WalRcvCtlBlock *walrcb = getCurrentWalRcvCtlBlock(); + + if (walrcb == NULL) { + WalRcvCtlReleaseExitLock(); + return; + } + + int64 walwriteoffset; + XLogRecPtr startptr; + SpinLockAcquire(&walrcb->mutex); + walwriteoffset = walrcb->walWriteOffset; + startptr = walrcb->walStart; + SpinLockRelease(&walrcb->mutex); + + if (XLByteLT(startptr, targetPagePtr)) { + WalRcvCtlReleaseExitLock(); + return; + } + + + for (uint32 i = 0; i < MAX_ALLOC_SEGNUM; ++i) { + pg_atomic_write_u32(&(g_recordbuffer->xlogsegarray[i].bufState), NONE); + } + + XLogSegNo segno; + XLByteToSeg(targetPagePtr, segno); + g_recordbuffer->xlogsegarray[g_recordbuffer->applyindex].segno = segno; + g_recordbuffer->xlogsegarray[g_recordbuffer->applyindex].readlen = targetPagePtr % XLogSegSize; + + pg_atomic_write_u32(&(g_recordbuffer->readindex), g_recordbuffer->applyindex); + pg_atomic_write_u32(&(g_recordbuffer->xlogsegarray[g_recordbuffer->readindex].bufState), APPLYING); + + pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_RUN); + WalRcvCtlReleaseExitLock(); + } +} + +RecordBufferAarray *GetCurrentSegmentBuf(XLogRecPtr targetPagePtr) +{ + Assert(g_recordbuffer->applyindex < MAX_ALLOC_SEGNUM); + uint32 applyindex = g_recordbuffer->applyindex; + RecordBufferAarray *cursegbuffer = &g_recordbuffer->xlogsegarray[applyindex]; + uint32 bufState = pg_atomic_read_u32(&(cursegbuffer->bufState)); + + if (bufState != APPLYING) { + return NULL; + } + uint32 targetPageOff = (targetPagePtr % XLogSegSize); + XLogSegNo targetSegNo; + XLByteToSeg(targetPagePtr, targetSegNo); + if (cursegbuffer->segno == targetSegNo) { + cursegbuffer->segoffset = targetPageOff; + return cursegbuffer; + } else if (cursegbuffer->segno + 1 == targetSegNo) { + Assert(targetPageOff == 0); + pg_atomic_write_u32(&(cursegbuffer->bufState), APPLIED); + if ((applyindex + 1) == MAX_ALLOC_SEGNUM) { + applyindex = 0; + } else { + applyindex++; + } + + pg_atomic_write_u32(&(g_recordbuffer->applyindex), applyindex); + cursegbuffer = &g_recordbuffer->xlogsegarray[applyindex]; + bufState = pg_atomic_read_u32(&(cursegbuffer->bufState)); + if (bufState != APPLYING) { + return NULL; + } + + Assert(cursegbuffer->segno == targetSegNo); + cursegbuffer->segoffset = targetPageOff; + return cursegbuffer; + } else { + ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("SetReadBufferForExtRto targetPagePtr:%lu", targetPagePtr))); + DumpExtremeRtoReadBuf(); + t_thrd.xlog_cxt.failedSources |= XLOG_FROM_STREAM; + return NULL; + } +} + +static const int MAX_WAIT_TIMS = 512; + +bool XLogPageReadForExtRto(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen) +{ + uint32 startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + if (startreadworker == WORKER_STATE_RUN) { + RecordBufferAarray *cursegbuffer = GetCurrentSegmentBuf(targetPagePtr); + if (cursegbuffer == NULL) { + return false; + } + + uint32 readlen = pg_atomic_read_u32(&(cursegbuffer->readlen)); + + uint32 waitcount = 0; + while (readlen < (cursegbuffer->segoffset + reqLen)) { + readlen = pg_atomic_read_u32(&(cursegbuffer->readlen)); + if (waitcount >= MAX_WAIT_TIMS) { + return false; + } + waitcount++; + } + + Assert(cursegbuffer->segoffset == (targetPagePtr % XLogSegSize)); + xlogreader->readBuf = 
cursegbuffer->readsegbuf + cursegbuffer->segoffset; + return true; + } + + return false; +} + +void XLogReadWorkerSegFallback(XLogSegNo lastRplSegNo) +{ + errno_t errorno = EOK; + uint32 readindex = pg_atomic_read_u32(&(g_recordbuffer->readindex)); + uint32 applyindex = pg_atomic_read_u32(&(g_recordbuffer->applyindex)); + RecordBufferAarray *readseg = &g_recordbuffer->xlogsegarray[readindex]; + RecordBufferAarray *applyseg = &g_recordbuffer->xlogsegarray[applyindex]; + + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("XLogReadWorkerSegFallback: readindex: %u, readseg[%lu,%lu,%u,%u], applyindex: %u," + "applyseg[%lu,%lu,%u,%u]", + readindex, readseg->segno, readseg->segoffset, readseg->readlen, readseg->bufState, applyindex, + applyseg->segno, applyseg->segoffset, applyseg->readlen, applyseg->bufState))); + + pg_atomic_write_u32(&(g_recordbuffer->readindex), applyindex); + pg_atomic_write_u32(&(readseg->bufState), APPLIED); + applyseg->segno = lastRplSegNo; + applyseg->readlen = applyseg->segoffset; + errorno = memset_s(applyseg->readsegbuf, XLogSegSize, 0, XLogSegSize); + securec_check(errorno, "", ""); +} + +bool CloseReadFile() +{ + if (t_thrd.xlog_cxt.readFile >= 0) { + close(t_thrd.xlog_cxt.readFile); + t_thrd.xlog_cxt.readFile = -1; + return true; + } + return false; +} + +void DispatchCleanupMarkToAllRedoWorker() +{ + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + PageRedoWorker *worker = g_dispatcher->allWorkers[i]; + if (worker->role == REDO_PAGE_WORKER) { + SPSCBlockingQueuePut(worker->queue, &g_cleanupMark); + } + } +} + +void DispatchClosefdMarkToAllRedoWorker() +{ + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + PageRedoWorker *worker = g_dispatcher->allWorkers[i]; + if (worker->role == REDO_PAGE_WORKER || worker->role == REDO_PAGE_MNG || + worker->role == REDO_TRXN_MNG || worker->role == REDO_TRXN_WORKER) { + SPSCBlockingQueuePut(worker->queue, &g_closefdMark); + } + } +} + +void DispatchCleanInvalidPageMarkToAllRedoWorker(RepairFileKey key) +{ + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + PageRedoWorker *worker = g_dispatcher->allWorkers[i]; + if (worker->role == REDO_PAGE_WORKER) { + errno_t rc = memcpy_s((char*)&g_cleanInvalidPageMark, + sizeof(RepairFileKey), (char*)&key, sizeof(RepairFileKey)); + securec_check(rc, "", ""); + SPSCBlockingQueuePut(worker->queue, &g_cleanInvalidPageMark); + } + } +} + +void WaitAllRedoWorkerIdle() +{ + instr_time startTime; + instr_time endTime; + bool allIdle = false; + INSTR_TIME_SET_CURRENT(startTime); + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("WaitAllRedoWorkerIdle begin, startTime: %lu us", INSTR_TIME_GET_MICROSEC(startTime)))); + while (!allIdle) { + allIdle = true; + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + PageRedoWorker *worker = g_dispatcher->allWorkers[i]; + if (worker->role == REDO_READ_WORKER || worker->role == REDO_READ_MNG) { + continue; + } + if (!RedoWorkerIsIdle(worker)) { + allIdle = false; + break; + } + } + RedoInterruptCallBack(); + } + INSTR_TIME_SET_CURRENT(endTime); + INSTR_TIME_SUBTRACT(endTime, startTime); + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("WaitAllRedoWorkerIdle end, cost time: %lu us", INSTR_TIME_GET_MICROSEC(endTime)))); +} + +void WaitAllReplayWorkerIdle() +{ + instr_time startTime; + instr_time endTime; + bool allIdle = false; + INSTR_TIME_SET_CURRENT(startTime); + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("WaitAllReplayWorkerIdle begin, 
startTime: %lu us", INSTR_TIME_GET_MICROSEC(startTime))));
+    while (!allIdle) {
+        allIdle = true;
+        for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) {
+            PageRedoWorker *worker = g_dispatcher->allWorkers[i];
+            if (worker->role == REDO_READ_WORKER || worker->role == REDO_READ_MNG ||
+                worker->role == REDO_READ_PAGE_WORKER) {
+                continue;
+            }
+            if (!RedoWorkerIsIdle(worker)) {
+                allIdle = false;
+                break;
+            }
+        }
+        RedoInterruptCallBack();
+    }
+    INSTR_TIME_SET_CURRENT(endTime);
+    INSTR_TIME_SUBTRACT(endTime, startTime);
+    ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+        errmsg("WaitAllReplayWorkerIdle end, cost time: %lu us", INSTR_TIME_GET_MICROSEC(endTime))));
+}
+
+void XLogForceFinish(XLogReaderState *xlogreader, TermFileData *term_file)
+{
+    bool closed = false;
+    uint32 termId = term_file->term;
+    XLogSegNo lastRplSegNo;
+
+    pg_atomic_write_u32(&(ondemand_extreme_rto::g_recordbuffer->readWorkerState), ondemand_extreme_rto::WORKER_STATE_STOPPING);
+    while (pg_atomic_read_u32(&(ondemand_extreme_rto::g_recordbuffer->readWorkerState)) != WORKER_STATE_STOP) {
+        RedoInterruptCallBack();
+    }
+    ShutdownWalRcv();
+    ShutdownDataRcv();
+    pg_atomic_write_u32(&(g_recordbuffer->readSource), XLOG_FROM_PG_XLOG);
+
+    PushToWorkerLsn();
+    g_cleanupMark.record.isDecode = true;
+    PutRecordToReadQueue(&g_cleanupMark.record);
+    WaitAllRedoWorkerIdle();
+
+    XLogRecPtr lastRplReadLsn;
+    XLogRecPtr lastRplEndLsn = GetXLogReplayRecPtr(NULL, &lastRplReadLsn);
+    XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
+    XLogRecPtr endRecPtr = xlogreader->EndRecPtr;
+    ereport(WARNING, (errcode(ERRCODE_LOG), errmsg("[ForceFinish]ArchiveXlogForForceFinishRedo in extremeRTO "
+        "lastRplReadLsn:%08X/%08X, lastRplEndLsn:%08X/%08X, receivedUpto:%08X/%08X, ReadRecPtr:%08X/%08X, "
+        "EndRecPtr:%08X/%08X, readOff:%u, latestValidRecord:%08X/%08X",
+        (uint32)(lastRplReadLsn >> 32), (uint32)lastRplReadLsn, (uint32)(lastRplEndLsn >> 32), (uint32)lastRplEndLsn,
+        (uint32)(receivedUpto >> 32), (uint32)receivedUpto, (uint32)(xlogreader->ReadRecPtr >> 32),
+        (uint32)xlogreader->ReadRecPtr, (uint32)(xlogreader->EndRecPtr >> 32), (uint32)xlogreader->EndRecPtr,
+        xlogreader->readOff, (uint32)(latestValidRecord >> 32), (uint32)latestValidRecord)));
+    DumpExtremeRtoReadBuf();
+    xlogreader->readOff = INVALID_READ_OFF;
+    XLByteToSeg(endRecPtr, lastRplSegNo);
+    XLogReadWorkerSegFallback(lastRplSegNo);
+
+    closed = CloseReadFile();
+    CopyXlogForForceFinishRedo(lastRplSegNo, termId, xlogreader, endRecPtr);
+    RenameXlogForForceFinishRedo(lastRplSegNo, xlogreader->readPageTLI, termId);
+    if (closed) {
+        ReOpenXlog(xlogreader);
+    }
+    t_thrd.xlog_cxt.invaildPageCnt = 0;
+    XLogCheckInvalidPages();
+    SetSwitchHistoryFile(endRecPtr, receivedUpto, termId);
+    t_thrd.xlog_cxt.invaildPageCnt = 0;
+    set_wal_rcv_write_rec_ptr(endRecPtr);
+    t_thrd.xlog_cxt.receivedUpto = endRecPtr;
+    pg_atomic_write_u32(&(g_instance.comm_cxt.localinfo_cxt.is_finish_redo), 0);
+    ereport(WARNING,
+        (errcode(ERRCODE_LOG), errmsg("[ForceFinish]ArchiveXlogForForceFinishRedo in extremeRTO is over")));
+}
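Both the force-finish path above and the queue-drain path below rely on a small set of marker records (g_redoEndMark, g_GlobalLsnForwarder, g_cleanupMark) that consumers recognize by comparing addresses rather than by inspecting contents. A minimal editor's sketch of the pattern, with invented names, not part of the patch:

    #include <cstdio>

    // Sentinels are static objects; a consumer recognizes them by address,
    // so the payload needs no extra "kind" field.
    struct Item { int payload; };
    static Item g_endMark;    // hypothetical end-of-stream sentinel
    static Item g_flushMark;  // hypothetical flush sentinel

    void Consume(Item *item)
    {
        if (item == &g_endMark) {
            printf("end mark: stop draining\n");
        } else if (item == &g_flushMark) {
            printf("flush mark: reset side state\n");
        } else {
            printf("ordinary item: %d\n", item->payload); // only real items are freed
        }
    }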
+static void DoCleanUpReadPageWorkerQueue(SPSCBlockingQueue *queue)
+{
+    while (!SPSCBlockingQueueIsEmpty(queue)) {
+        XLogReaderState *xlogreader = reinterpret_cast<XLogReaderState *>(SPSCBlockingQueueTake(queue));
+        if (xlogreader == reinterpret_cast<XLogReaderState *>(&(g_redoEndMark.record)) ||
+            xlogreader == reinterpret_cast<XLogReaderState *>(&(g_GlobalLsnForwarder.record)) ||
+            xlogreader == reinterpret_cast<XLogReaderState *>(&(g_cleanupMark.record))) {
+            if (xlogreader == reinterpret_cast<XLogReaderState *>(&(g_GlobalLsnForwarder.record))) {
+                pg_atomic_write_u32(&g_GlobalLsnForwarder.record.refcount, 0);
+            }
+            continue;
+        }
+
+        RedoItem *item = GetRedoItemPtr(xlogreader);
+        FreeRedoItem(item);
+    }
+}
+
+void CleanUpReadPageWorkerQueue()
+{
+    SPSCBlockingQueue *queue = g_dispatcher->readLine.readPageThd->queue;
+    uint32 state;
+    do {
+        DoCleanUpReadPageWorkerQueue(queue);
+        RedoInterruptCallBack();
+        state = pg_atomic_read_u32(&ondemand_extreme_rto::g_dispatcher->rtoXlogBufState.readPageWorkerState);
+    } while (state != WORKER_STATE_EXIT);
+    /* process the state change after the queue is cleared */
+    DoCleanUpReadPageWorkerQueue(queue);
+}
+
+void ExtremeRtoStopHere()
+{
+    if ((get_real_recovery_parallelism() > 1) && (GetBatchCount() > 0)) {
+        g_dispatcher->recoveryStop = true;
+        CleanUpReadPageWorkerQueue();
+    }
+}
+
+static void CheckAndDoForceFinish(XLogReaderState *xlogreader)
+{
+    TermFileData term_file;
+    if (CheckForForceFinishRedoTrigger(&term_file)) {
+        ereport(WARNING,
+            (errmsg("[ForceFinish] force finish triggered in XLogReadPageWorkerMain, ReadRecPtr:%08X/%08X, "
+                "EndRecPtr:%08X/%08X, StandbyMode:%u, startup_processing:%u, dummyStandbyMode:%u",
+                (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr,
+                (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), (uint32)t_thrd.xlog_cxt.EndRecPtr,
+                t_thrd.xlog_cxt.StandbyMode, t_thrd.xlog_cxt.startup_processing, dummyStandbyMode)));
+        XLogForceFinish(xlogreader, &term_file);
+    }
+}
+
+/* read xlog in parallel */
+void XLogReadPageWorkerMain()
+{
+    XLogReaderState *xlogreader = NULL;
+
+    (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts);
+
+    g_recordbuffer = &g_dispatcher->rtoXlogBufState;
+    GetRecoveryLatch();
+    /* init readstate */
+    InitXLogRecordReadBuffer(&xlogreader);
+
+    pg_atomic_write_u32(&(g_recordbuffer->readPageWorkerState), WORKER_STATE_RUN);
+    if (IsRecoveryDone()) {
+        t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM;
+        t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM;
+        pg_atomic_write_u32(&(g_recordbuffer->readSource), XLOG_FROM_STREAM);
+    }
+
+    XLogRecord *record = XLogParallelReadNextRecord(xlogreader);
+    while (record != NULL) {
+        if (ReadPageWorkerStop()) {
+            break;
+        }
+        GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]);
+        XLogReaderState *newxlogreader = NewReaderState(xlogreader);
+        CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3], g_redoWorker->timeCostList[TIME_COST_STEP_4]);
+        PutRecordToReadQueue(xlogreader);
+        CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_4], g_redoWorker->timeCostList[TIME_COST_STEP_5]);
+        xlogreader = newxlogreader;
+
+        g_redoWorker->lastReplayedReadRecPtr = xlogreader->ReadRecPtr;
+        g_redoWorker->lastReplayedEndRecPtr = xlogreader->EndRecPtr;
+
+        if (FORCE_FINISH_ENABLED) {
+            CheckAndDoForceFinish(xlogreader);
+        }
+        CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_5], g_redoWorker->timeCostList[TIME_COST_STEP_1]);
+        record = XLogParallelReadNextRecord(xlogreader);
+        CountAndGetRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1], g_redoWorker->timeCostList[TIME_COST_STEP_2]);
+        CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]);
+        RedoInterruptCallBack();
+        ADD_ABNORMAL_POSITION(8);
+    }
+
+    uint32 workState = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+    while (workState == WORKER_STATE_STOPPING) {
+        workState = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+    }
+
+    if (workState != WORKER_STATE_EXITING && workState != WORKER_STATE_EXIT) {
+        pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_EXITING);
+    }
+
+    if (!ReadPageWorkerStop()) {
+        /* notify exit */
+        PushToWorkerLsn();
+        g_redoEndMark.record = *xlogreader;
+        g_redoEndMark.record.isDecode = true;
+        PutRecordToReadQueue((XLogReaderState *)&g_redoEndMark.record);
+    }
+
+    ReLeaseRecoveryLatch();
+    pg_atomic_write_u32(&(g_recordbuffer->readPageWorkerState), WORKER_STATE_EXIT);
+}
+
+void HandleReadWorkerRunInterrupts()
+{
+    if (t_thrd.page_redo_cxt.got_SIGHUP) {
+        t_thrd.page_redo_cxt.got_SIGHUP = false;
+        ProcessConfigFile(PGC_SIGHUP);
+    }
+
+    if (t_thrd.page_redo_cxt.shutdown_requested) {
+        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("page worker id %u exit for request", g_redoWorker->id)));
+
+        pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[g_redoWorker->id].threadState),
+            PAGE_REDO_WORKER_EXIT);
+
+        proc_exit(1);
+    }
+}
+
+static void InitReadBuf(uint32 bufIndex, XLogSegNo segno)
+{
+    if (bufIndex == MAX_ALLOC_SEGNUM) {
+        bufIndex = 0;
+    }
+    const uint32 sleepTime = 50; /* 50 us */
+    RecordBufferAarray *nextreadseg = &g_recordbuffer->xlogsegarray[bufIndex];
+    pg_memory_barrier();
+
+    uint32 bufState = pg_atomic_read_u32(&(nextreadseg->bufState));
+    uint32 startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+    while (bufState == APPLYING && startreadworker == WORKER_STATE_RUN) {
+        pg_usleep(sleepTime);
+        RedoInterruptCallBack();
+        bufState = pg_atomic_read_u32(&(nextreadseg->bufState));
+        startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+    }
+
+    nextreadseg->readlen = 0;
+    nextreadseg->segno = segno;
+    nextreadseg->segoffset = 0;
+    pg_atomic_write_u32(&(nextreadseg->bufState), APPLYING);
+    pg_atomic_write_u32(&(g_recordbuffer->readindex), bufIndex);
+}
+
+static void XLogReadWorkRun()
+{
+    static uint32 waitcount = 0;
+    const uint32 sleepTime = 100; /* 100 us */
+    XLogSegNo targetSegNo;
+    uint32 writeoffset;
+    uint32 reqlen;
+    GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]);
+    uint32 readindex = pg_atomic_read_u32(&(g_recordbuffer->readindex));
+    Assert(readindex < MAX_ALLOC_SEGNUM);
+    pg_memory_barrier();
+    RecordBufferAarray *readseg = &g_recordbuffer->xlogsegarray[readindex];
+
+    XLogRecPtr receivedUpto = GetWalRcvWriteRecPtr(NULL);
+    XLByteToSeg(receivedUpto, targetSegNo);
+
+    if (targetSegNo < readseg->segno) {
+        pg_usleep(sleepTime);
+        return;
+    }
+
+    writeoffset = readseg->readlen;
+    if (targetSegNo != readseg->segno) {
+        reqlen = XLogSegSize - writeoffset;
+    } else {
+        uint32 targetPageOff = receivedUpto % XLogSegSize;
+        if (targetPageOff <= writeoffset) {
+            pg_usleep(sleepTime);
+            return;
+        }
+        reqlen = targetPageOff - writeoffset;
+        if (reqlen < XLOG_BLCKSZ) {
+            waitcount++;
+            uint32 flag = pg_atomic_read_u32(&g_readManagerTriggerFlag);
+            if (waitcount < MAX_WAIT_TIMS && flag == TRIGGER_NORMAL) {
+                pg_usleep(sleepTime);
+                return;
+            }
+        }
+    }
+
+    waitcount = 0;
+    char *readBuf = readseg->readsegbuf + writeoffset;
+    XLogRecPtr targetStartPtr = readseg->segno * XLogSegSize + writeoffset;
+    uint32 readlen = 0;
+    GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]);
+    bool result = XLogReadFromWriteBuffer(targetStartPtr, reqlen, readBuf, &readlen);
+    CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_2]);
+    if (!result) {
+        return;
+    }
+
+    pg_atomic_write_u32(&(readseg->readlen), (writeoffset + readlen));
+    if (readseg->readlen == XLogSegSize) {
+        GetRedoStartTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]);
+        InitReadBuf(readindex + 1, readseg->segno + 1);
+        CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_3]);
+    }
+
+    CountRedoTime(g_redoWorker->timeCostList[TIME_COST_STEP_1]);
+}
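XLogReadWorkRun copies WAL into the current segment buffer and only then advances the atomic readlen watermark, while XLogPageReadForExtRto (earlier in this file) spins until the watermark covers the bytes it needs. A self-contained sketch of that handshake, an editor's illustration with invented names rather than the patch's API:

    #include <atomic>
    #include <cstdint>
    #include <cstring>

    struct SegBuf {
        char data[8192];
        std::atomic<uint32_t> readlen{0}; // bytes valid so far
    };

    // Reader side: fill the buffer first, then publish with release semantics.
    void Produce(SegBuf &seg, const char *src, uint32_t len)
    {
        uint32_t off = seg.readlen.load(std::memory_order_relaxed);
        memcpy(seg.data + off, src, len);
        seg.readlen.store(off + len, std::memory_order_release);
    }

    // Redo side: may only consume bytes below the published watermark.
    bool CanConsume(const SegBuf &seg, uint32_t off, uint32_t need)
    {
        return seg.readlen.load(std::memory_order_acquire) >= off + need;
    }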
+void XLogReadManagerResponseSignal(uint32 trigger)
+{
+    switch (trigger) {
+        case TRIGGER_PRIMARY:
+            break;
+        case TRIGGER_FAILOVER:
+            if (t_thrd.xlog_cxt.is_cascade_standby) {
+                SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
+                t_thrd.xlog_cxt.is_cascade_standby = false;
+                if (t_thrd.postmaster_cxt.HaShmData->is_cross_region) {
+                    t_thrd.xlog_cxt.is_hadr_main_standby = true;
+                    SpinLockAcquire(&t_thrd.postmaster_cxt.HaShmData->mutex);
+                    t_thrd.postmaster_cxt.HaShmData->is_hadr_main_standby = true;
+                    SpinLockRelease(&t_thrd.postmaster_cxt.HaShmData->mutex);
+                }
+                t_thrd.xlog_cxt.failover_triggered = false;
+                SendNotifySignal(NOTIFY_STANDBY, g_instance.pid_cxt.StartupPID);
+                SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL);
+                ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                    errmsg("failover standby ready, notify postmaster to change state.")));
+                break;
+            }
+            t_thrd.xlog_cxt.failover_triggered = true;
+            SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
+            ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                errmsg("failover ready, notify postmaster to change state.")));
+            break;
+        case TRIGGER_SWITCHOVER:
+            t_thrd.xlog_cxt.switchover_triggered = true;
+            SendPostmasterSignal(PMSIGNAL_UPDATE_PROMOTING);
+            ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                errmsg("switchover ready, notify postmaster to change state.")));
+            break;
+        default:
+            break;
+    }
+}
+
+void XLogReadManagerProcInterrupt()
+{
+    if (t_thrd.page_redo_cxt.shutdown_requested) {
+        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("page worker id %u exit for request", g_redoWorker->id)));
+
+        pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[g_redoWorker->id].threadState),
+            PAGE_REDO_WORKER_EXIT);
+
+        proc_exit(1);
+    }
+
+    if (t_thrd.page_redo_cxt.got_SIGHUP) {
+        t_thrd.page_redo_cxt.got_SIGHUP = false;
+        ProcessConfigFile(PGC_SIGHUP);
+    }
+}
+
+void WaitPageReadWorkerExit()
+{
+    uint32 state;
+    do {
+        state = pg_atomic_read_u32(&ondemand_extreme_rto::g_dispatcher->rtoXlogBufState.readPageWorkerState);
+        RedoInterruptCallBack();
+    } while (state != WORKER_STATE_EXIT);
+}
+
+static void HandleExtremeRtoCascadeStandbyPromote(uint32 trigger)
+{
+    if (!t_thrd.xlog_cxt.is_cascade_standby || t_thrd.xlog_cxt.server_mode != STANDBY_MODE ||
+        !IS_DN_MULTI_STANDYS_MODE()) {
+        return;
+    }
+
+    ShutdownWalRcv();
+    pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone, 1);
+    WakeupRecovery();
+    XLogReadManagerResponseSignal(trigger);
+    pg_atomic_write_u32(&g_startupTriggerState, TRIGGER_NORMAL);
+}
+
+bool XLogReadManagerCheckSignal()
+{
+    uint32 trigger = pg_atomic_read_u32(&g_startupTriggerState);
+    load_server_mode();
+    if (g_dispatcher->smartShutdown || trigger == TRIGGER_PRIMARY || trigger == TRIGGER_SWITCHOVER ||
+        (trigger == TRIGGER_FAILOVER && t_thrd.xlog_cxt.server_mode == STANDBY_MODE) ||
+        t_thrd.xlog_cxt.server_mode == PRIMARY_MODE) {
+        ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("XLogReadManagerCheckSignal: smartShutdown:%u, trigger:%u, server_mode:%u",
+                g_dispatcher->smartShutdown, trigger, t_thrd.xlog_cxt.server_mode)));
+        if (t_thrd.xlog_cxt.is_cascade_standby && t_thrd.xlog_cxt.server_mode == STANDBY_MODE &&
+            IS_DN_MULTI_STANDYS_MODE() && (trigger == TRIGGER_SWITCHOVER || trigger == TRIGGER_FAILOVER)) {
+            HandleExtremeRtoCascadeStandbyPromote(trigger);
+            return false;
+        }
+        ShutdownWalRcv();
+        if
(g_dispatcher->smartShutdown) { + pg_atomic_write_u32(&g_readManagerTriggerFlag, TRIGGER_SMARTSHUTDOWN); + } else { + pg_atomic_write_u32(&g_readManagerTriggerFlag, trigger); + } + WakeupRecovery(); + WaitPageReadWorkerExit(); + XLogReadManagerResponseSignal(trigger); + return true; + } + return false; +} + +void StartRequestXLogFromStream() +{ + volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv; + XLogRecPtr expectLsn = pg_atomic_read_u64(&g_dispatcher->rtoXlogBufState.expectLsn); + if (walrcv->receivedUpto == InvalidXLogRecPtr || + (expectLsn != InvalidXLogRecPtr && XLByteLE(walrcv->receivedUpto, expectLsn))) { + uint32 readWorkerstate = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + if (readWorkerstate == WORKER_STATE_RUN) { + pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_STOPPING); + } + SpinLockAcquire(&walrcv->mutex); + walrcv->receivedUpto = 0; + SpinLockRelease(&walrcv->mutex); + XLogRecPtr targetRecPtr = pg_atomic_read_u64(&g_dispatcher->rtoXlogBufState.targetRecPtr); + CheckMaxPageFlushLSN(targetRecPtr); + + uint32 shiftSize = 32; + if (IS_OBS_DISASTER_RECOVER_MODE && !IsRoachRestore()) { + ereport(LOG, (errmsg("request xlog stream from obs at %X/%X.", (uint32)(targetRecPtr >> shiftSize), + (uint32)targetRecPtr))); + RequestXLogStreaming(&targetRecPtr, 0, REPCONNTARGET_OBS, 0); + } else if (IS_SHARED_STORAGE_STANBY_MODE && !IS_SHARED_STORAGE_MAIN_STANDBY_MODE) { +#ifndef ENABLE_MULTIPLE_NODES + rename_recovery_conf_for_roach(); +#endif + ereport(LOG, (errmsg("request xlog stream from shared storage at %X/%X.", + (uint32)(targetRecPtr >> shiftSize), + (uint32)targetRecPtr))); + RequestXLogStreaming(&targetRecPtr, 0, REPCONNTARGET_SHARED_STORAGE, 0); + } else { +#ifndef ENABLE_MULTIPLE_NODES + rename_recovery_conf_for_roach(); +#endif + ereport(LOG, (errmsg("request xlog stream at %X/%X.", (uint32)(targetRecPtr >> shiftSize), + (uint32)targetRecPtr))); + RequestXLogStreaming(&targetRecPtr, t_thrd.xlog_cxt.PrimaryConnInfo, REPCONNTARGET_PRIMARY, + u_sess->attr.attr_storage.PrimarySlotName); + } + } +} + + +void XLogReadManagerMain() +{ + const long sleepShortTime = 100000L; + const long sleepLongTime = 1000000L; + g_recordbuffer = &g_dispatcher->rtoXlogBufState; + uint32 xlogReadManagerState = READ_MANAGER_RUN; + + (void)RegisterRedoInterruptCallBack(XLogReadManagerProcInterrupt); + + while (xlogReadManagerState == READ_MANAGER_RUN) { + RedoInterruptCallBack(); + XLogRecPtr replay = InvalidXLogRecPtr; + bool exitStatus = XLogReadManagerCheckSignal(); + if (exitStatus) { + break; + } + + replay = GetXLogReplayRecPtr(NULL, NULL); + handleRecoverySusPend(replay); + + xlogReadManagerState = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.xlogReadManagerState); + ADD_ABNORMAL_POSITION(7); + if (t_thrd.xlog_cxt.server_mode == STANDBY_MODE) { + uint32 readSource = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.readSource); + uint32 failSource = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.failSource); + if (readSource & XLOG_FROM_STREAM) { + uint32 disableConnectionNode = + pg_atomic_read_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node); + bool retryConnect = ((!disableConnectionNode) || (IS_SHARED_STORAGE_MODE && disableConnectionNode && + !knl_g_get_redo_finish_status() && + !pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage))); + if (!WalRcvInProgress() && g_instance.pid_cxt.WalReceiverPID == 0 && retryConnect) { + StartRequestXLogFromStream(); + } else { + if 
(disableConnectionNode) {
+                        if (IS_SHARED_STORAGE_MODE && WalRcvIsRunning()) {
+                            ShutdownWalRcv();
+                        }
+
+                        if (!WalRcvInProgress() && !knl_g_get_redo_finish_status()) {
+                            pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone, 1);
+                            WakeupRecovery();
+                            pg_usleep(sleepLongTime);
+                        } else if (knl_g_get_redo_finish_status()) {
+                            pg_atomic_write_u32(&g_instance.comm_cxt.localinfo_cxt.need_disable_connection_node, false);
+                            pg_usleep(sleepLongTime);
+                        }
+                    }
+                }
+            }
+
+            if (failSource & XLOG_FROM_STREAM) {
+                ShutdownWalRcv();
+                pg_atomic_write_u32(&(ondemand_extreme_rto::g_dispatcher->rtoXlogBufState.failSource), 0);
+            }
+        }
+        pg_usleep(sleepShortTime);
+    }
+}
+
+static void ReadWorkerStopCallBack(int code, Datum arg)
+{
+    pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_EXIT);
+    if (t_thrd.xlog_cxt.readFile >= 0) {
+        close(t_thrd.xlog_cxt.readFile);
+        t_thrd.xlog_cxt.readFile = -1;
+    }
+}
+
+void XLogReadWorkerMain()
+{
+    uint32 startreadworker;
+    const uint32 sleepTime = 50; /* 50 us */
+
+    on_shmem_exit(ReadWorkerStopCallBack, 0);
+    (void)RegisterRedoInterruptCallBack(HandleReadWorkerRunInterrupts);
+
+    g_recordbuffer = &g_dispatcher->rtoXlogBufState;
+    startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+    while (startreadworker != WORKER_STATE_EXITING) {
+        if (startreadworker == WORKER_STATE_RUN) {
+            XLogReadWorkRun();
+        } else {
+            pg_usleep(sleepTime);
+        }
+
+        RedoInterruptCallBack();
+        startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+        if (startreadworker == WORKER_STATE_STOPPING) {
+            pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_STOP);
+        }
+        ADD_ABNORMAL_POSITION(6);
+    }
+    /* notify the manager to exit */
+    pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_EXIT);
+}
+
+int RedoMainLoop()
+{
+    g_redoWorker->oldCtx = MemoryContextSwitchTo(g_instance.comm_cxt.predo_cxt.parallelRedoCtx);
+
+    instr_time startTime;
+    instr_time endTime;
+
+    INSTR_TIME_SET_CURRENT(startTime);
+    switch (g_redoWorker->role) {
+        case REDO_BATCH:
+            BatchRedoMain();
+            break;
+        case REDO_PAGE_MNG:
+            RedoPageManagerMain();
+            break;
+        case REDO_PAGE_WORKER:
+            RedoPageWorkerMain();
+            break;
+        case REDO_TRXN_MNG:
+            TrxnManagerMain();
+            break;
+        case REDO_TRXN_WORKER:
+            TrxnWorkMain();
+            break;
+        case REDO_READ_WORKER:
+            XLogReadWorkerMain();
+            break;
+        case REDO_READ_PAGE_WORKER:
+            XLogReadPageWorkerMain();
+            break;
+        case REDO_READ_MNG:
+            XLogReadManagerMain();
+            break;
+        default:
+            break;
+    }
+
+    INSTR_TIME_SET_CURRENT(endTime);
+    INSTR_TIME_SUBTRACT(endTime, startTime);
+
+    /*
+     * We need to get the exit code here before we allow the dispatcher
+     * to proceed and change the exit code.
+     */
+    int exitCode = GetDispatcherExitCode();
+    g_redoWorker->xlogInvalidPages = XLogGetInvalidPages();
+    g_redoWorker->committingCsnList = XLogReleaseAndGetCommittingCsnList();
+
+    ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+        errmsg("worker[%d]: exitcode = %d, total elapsed = %ld", g_redoWorker->id, exitCode,
+            INSTR_TIME_GET_MICROSEC(endTime))));
+
+    (void)MemoryContextSwitchTo(g_redoWorker->oldCtx);
+
+    return exitCode;
+}
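RedoMainLoop brackets the role-specific loop with INSTR_TIME_* calls so each worker reports its total elapsed time on exit. The same pattern, condensed with std::chrono so it compiles stand-alone (editor's sketch, not part of the patch):

    #include <chrono>
    #include <cstdio>

    int TimedMainLoop(int (*body)())
    {
        auto start = std::chrono::steady_clock::now();
        int exitCode = body(); // role-specific main loop
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::steady_clock::now() - start).count();
        printf("exitcode = %d, total elapsed = %lld us\n", exitCode, (long long)us);
        return exitCode;
    }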
+void ParallelRedoThreadRegister()
+{
+    bool isWorkerStarting = false;
+    SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock));
+    isWorkerStarting = ((g_instance.comm_cxt.predo_cxt.state == REDO_STARTING_BEGIN) ? true : false);
+    if (isWorkerStarting) {
+        pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[g_redoWorker->id].threadState),
+            PAGE_REDO_WORKER_READY);
+    }
+    SpinLockRelease(&(g_instance.comm_cxt.predo_cxt.rwlock));
+    if (!isWorkerStarting) {
+        ereport(LOG, (errmsg("ParallelRedoThreadRegister Page-redo-worker %u exit.", (uint32)isWorkerStarting)));
+        SetPageWorkStateByThreadId(PAGE_REDO_WORKER_EXIT);
+        proc_exit(0);
+    }
+}
+
+const char *RedoWokerRole2Str(RedoRole role)
+{
+    switch (role) {
+        case REDO_BATCH:
+            return "redo_batch";
+        case REDO_PAGE_MNG:
+            return "redo_manager";
+        case REDO_PAGE_WORKER:
+            return "redo_worker";
+        case REDO_TRXN_MNG:
+            return "trxn_manager";
+        case REDO_TRXN_WORKER:
+            return "trxn_worker";
+        case REDO_READ_WORKER:
+            return "read_worker";
+        case REDO_READ_PAGE_WORKER:
+            return "read_page_worker";
+        case REDO_READ_MNG:
+            return "read_manager";
+        default:
+            return "unknown";
+    }
+}
+
+void WaitStateNormal()
+{
+    do {
+        RedoInterruptCallBack();
+    } while (g_instance.comm_cxt.predo_cxt.state < REDO_IN_PROGRESS);
+}
+
+/* Run from the worker thread. */
+void ParallelRedoThreadMain()
+{
+    ParallelRedoThreadRegister();
+    ereport(LOG, (errmsg("Page-redo-worker thread %u started, role:%u, slotId:%u.", g_redoWorker->id,
+        g_redoWorker->role, g_redoWorker->slotId)));
+    // register the default interrupt callback
+    (void)RegisterRedoInterruptCallBack(HandlePageRedoInterrupts);
+    SetupSignalHandlers();
+    InitGlobals();
+
+    ResourceManagerStartup();
+    InitRecoveryLockHash();
+    WaitStateNormal();
+    EnableSyncRequestForwarding();
+
+    int retCode = RedoMainLoop();
+    StandbyReleaseAllLocks();
+    ResourceManagerStop();
+    ereport(LOG, (errmsg("Page-redo-worker thread %u terminated, role:%u, slotId:%u, retcode %d.", g_redoWorker->id,
+        g_redoWorker->role, g_redoWorker->slotId, retCode)));
+    LastMarkReached();
+
+    pg_atomic_write_u32(&(g_instance.comm_cxt.predo_cxt.pageRedoThreadStatusList[g_redoWorker->id].threadState),
+        PAGE_REDO_WORKER_EXIT);
+    proc_exit(0);
+}
+
+static void PageRedoShutdownHandler(SIGNAL_ARGS)
+{
+    t_thrd.page_redo_cxt.shutdown_requested = 1;
+}
+
+static void PageRedoQuickDie(SIGNAL_ARGS)
+{
+    int status = 2;
+    gs_signal_setmask(&t_thrd.libpq_cxt.BlockSig, NULL);
+    on_exit_reset();
+    exit(status);
+}
+
+static void PageRedoUser1Handler(SIGNAL_ARGS)
+{
+    t_thrd.page_redo_cxt.check_repair = true;
+}
+
+static void PageRedoUser2Handler(SIGNAL_ARGS)
+{
+    t_thrd.page_redo_cxt.sleep_long = 1;
+}
+
+/* Run from the worker thread. */
+static void SetupSignalHandlers()
+{
+    (void)gspqsignal(SIGHUP, SigHupHandler);
+    (void)gspqsignal(SIGINT, SIG_IGN);
+    (void)gspqsignal(SIGTERM, PageRedoShutdownHandler);
+    (void)gspqsignal(SIGQUIT, PageRedoQuickDie);
+    (void)gspqsignal(SIGPIPE, SIG_IGN);
+    (void)gspqsignal(SIGUSR1, PageRedoUser1Handler);
+    (void)gspqsignal(SIGUSR2, PageRedoUser2Handler);
+    (void)gspqsignal(SIGCHLD, SIG_IGN);
+    (void)gspqsignal(SIGTTIN, SIG_IGN);
+    (void)gspqsignal(SIGTTOU, SIG_IGN);
+    (void)gspqsignal(SIGCONT, SIG_IGN);
+    (void)gspqsignal(SIGWINCH, SIG_IGN);
+    (void)gspqsignal(SIGURG, print_stack);
+    if (g_instance.attr.attr_storage.EnableHotStandby) {
+        (void)gspqsignal(SIGALRM, handle_standby_sig_alarm); /* ignored unless InHotStandby */
+    } else {
+        (void)gspqsignal(SIGALRM, SIG_IGN);
+    }
+
+    gs_signal_setmask(&t_thrd.libpq_cxt.UnBlockSig, NULL);
+    (void)gs_signal_unblock_sigusr2();
+}
+
+/* Run from the worker thread.
*/ +static void SigHupHandler(SIGNAL_ARGS) +{ + t_thrd.page_redo_cxt.got_SIGHUP = true; +} + +/* Run from the worker thread. */ +static void InitGlobals() +{ + t_thrd.utils_cxt.CurrentResourceOwner = ResourceOwnerCreate(NULL, "ExtremeRtoParallelRedoThread", + THREAD_GET_MEM_CXT_GROUP(MEMORY_CONTEXT_STORAGE)); + + t_thrd.xlog_cxt.server_mode = g_redoWorker->initialServerMode; + t_thrd.xlog_cxt.ThisTimeLineID = g_redoWorker->initialTimeLineID; + t_thrd.xlog_cxt.expectedTLIs = g_redoWorker->expectedTLIs; + /* apply recoveryinfo will change standbystate see UpdateRecordGlobals */ + t_thrd.xlog_cxt.standbyState = g_redoWorker->standbyState; + t_thrd.xlog_cxt.StandbyMode = g_redoWorker->StandbyMode; + t_thrd.xlog_cxt.InRecovery = true; + t_thrd.xlog_cxt.startup_processing = true; + t_thrd.proc_cxt.DataDir = g_redoWorker->DataDir; + u_sess->utils_cxt.RecentXmin = g_redoWorker->RecentXmin; + g_redoWorker->proc = t_thrd.proc; + t_thrd.storage_cxt.latestObservedXid = g_redoWorker->latestObservedXid; + t_thrd.xlog_cxt.recoveryTargetTLI = g_redoWorker->recoveryTargetTLI; + t_thrd.xlog_cxt.recoveryRestoreCommand= g_redoWorker->recoveryRestoreCommand; + t_thrd.xlog_cxt.ArchiveRecoveryRequested = g_redoWorker->ArchiveRecoveryRequested; + t_thrd.xlog_cxt.StandbyModeRequested = g_redoWorker->StandbyModeRequested; + t_thrd.xlog_cxt.InArchiveRecovery = g_redoWorker->InArchiveRecovery; + t_thrd.xlog_cxt.InRecovery = g_redoWorker->InRecovery; + t_thrd.xlog_cxt.ArchiveRestoreRequested = g_redoWorker->ArchiveRestoreRequested; + t_thrd.xlog_cxt.minRecoveryPoint = g_redoWorker->minRecoveryPoint; + t_thrd.xlog_cxt.curFileTLI = t_thrd.xlog_cxt.ThisTimeLineID; +} + +void WaitRedoWorkersQueueEmpty() +{ + bool queueIsEmpty = false; + while (!queueIsEmpty) { + queueIsEmpty = true; + for (uint32 i = 0; i < g_dispatcher->allWorkersCnt; i++) { + PageRedoWorker *worker = g_dispatcher->allWorkers[i]; + if (worker->role == REDO_TRXN_WORKER || worker->role == REDO_PAGE_WORKER) { + if (!RedoWorkerIsIdle(worker)) { + queueIsEmpty = false; + break; + } + } + } + RedoInterruptCallBack(); + } +} + +void RedoThrdWaitForExit(const PageRedoWorker *wk) +{ + uint32 sd = wk->slotId; + switch (wk->role) { + case REDO_BATCH: + SendPageRedoEndMark(g_dispatcher->pageLines[sd].managerThd); + WaitPageRedoWorkerReachLastMark(g_dispatcher->pageLines[sd].managerThd); + break; + case REDO_PAGE_MNG: + DispatchEndMarkToRedoWorkerAndWait(); + break; + case REDO_PAGE_WORKER: + break; /* Don't need to wait for anyone */ + case REDO_TRXN_MNG: + SendPageRedoEndMark(g_dispatcher->trxnLine.redoThd); + WaitRedoWorkersQueueEmpty(); + WaitPageRedoWorkerReachLastMark(g_dispatcher->trxnLine.redoThd); + break; + case REDO_TRXN_WORKER: + break; /* Don't need to wait for anyone */ + default: + break; + } +} + +/* Run from the txn worker thread. */ +XLogRecPtr GetCompletedRecPtr(PageRedoWorker *worker) +{ + return pg_atomic_read_u64(&worker->lastReplayedEndRecPtr); +} + +/* Run from the worker thread. */ +static void ApplySinglePageRecord(RedoItem *item) +{ + XLogReaderState *record = &item->record; + + MemoryContext oldCtx = MemoryContextSwitchTo(g_redoWorker->oldCtx); + ApplyRedoRecord(record); + (void)MemoryContextSwitchTo(oldCtx); +} + +/* Run from the worker thread. */ +static void LastMarkReached() +{ + PosixSemaphorePost(&g_redoWorker->phaseMarker); +} + +/* Run from the dispatcher thread. */ +void WaitPageRedoWorkerReachLastMark(PageRedoWorker *worker) +{ + PosixSemaphoreWait(&worker->phaseMarker); +} + +/* Run from the dispatcher thread. 
*/ +void AddPageRedoItem(PageRedoWorker *worker, void *item) +{ + SPSCBlockingQueuePut(worker->queue, item); +} + +/* Run from the dispatcher thread. */ +bool SendPageRedoEndMark(PageRedoWorker *worker) +{ + return SPSCBlockingQueuePut(worker->queue, &g_redoEndMark); +} + +/* Run from the dispatcher thread. */ +bool SendPageRedoWorkerTerminateMark(PageRedoWorker *worker) +{ + return SPSCBlockingQueuePut(worker->queue, &g_terminateMark); +} + +/* Run from the txn worker thread. */ +void UpdatePageRedoWorkerStandbyState(PageRedoWorker *worker, HotStandbyState newState) +{ + /* + * Here we only save the new state into the worker struct. + * The actual update of the worker thread's state occurs inside + * the apply loop. + */ + worker->standbyState = newState; +} + +/* Run from the dispatcher thread. */ +void *GetXLogInvalidPages(PageRedoWorker *worker) +{ + return worker->xlogInvalidPages; +} + +bool RedoWorkerIsIdle(PageRedoWorker *worker) +{ + return SPSCBlockingQueueIsEmpty(worker->queue); +} + +void DumpPageRedoWorker(PageRedoWorker *worker) +{ + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("[REDO_LOG_TRACE]RedoWorker common info: id %u, tid %lu, " + "lastCheckedRestartPoint %lu, lastReplayedEndRecPtr %lu standbyState %u", + worker->id, worker->tid.thid, worker->lastCheckedRestartPoint, worker->lastReplayedEndRecPtr, + (uint32)worker->standbyState))); + DumpQueue(worker->queue); +} + +void DumpExtremeRtoReadBuf() +{ + if (g_dispatcher == NULL) { + return; + } + + ereport(LOG, + (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("DumpExtremeRtoReadBuf: startworker %u, readindex %u, applyindex %u, readSource %u, failSource %u", + g_dispatcher->rtoXlogBufState.readWorkerState, g_dispatcher->rtoXlogBufState.readindex, + g_dispatcher->rtoXlogBufState.applyindex, g_dispatcher->rtoXlogBufState.readSource, + g_dispatcher->rtoXlogBufState.failSource))); + + for (uint32 i = 0; i < MAX_ALLOC_SEGNUM; ++i) { + ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("DumpExtremeRtoReadBuf: buf %u, state %u, readlen %u, segno %lu, segoffset %lu", i, + g_dispatcher->rtoXlogBufState.xlogsegarray[i].bufState, + g_dispatcher->rtoXlogBufState.xlogsegarray[i].readlen, + g_dispatcher->rtoXlogBufState.xlogsegarray[i].segno, + g_dispatcher->rtoXlogBufState.xlogsegarray[i].segoffset))); + } +} + +bool XactHasSegpageRelFiles(XLogReaderState *record) +{ + int nrels = 0; + ColFileNode *xnodes = NULL; + + if (XLogRecGetRmid(record) != RM_XACT_ID) { + return false; + } + bool compress; + XactGetRelFiles(record, &xnodes, &nrels, &compress); + + for (int32 idx = 0; idx < nrels; idx++) { + ColFileNode colFileNode; + if (compress) { + ColFileNode *colFileNodeRel = xnodes + idx; + ColFileNodeFullCopy(&colFileNode, colFileNodeRel); + } else { + ColFileNodeRel *colFileNodeRel = ((ColFileNodeRel *)xnodes) + idx; + ColFileNodeCopy(&colFileNode, colFileNodeRel); + } + if (!IsValidColForkNum(colFileNode.forknum) && IsSegmentFileNode(colFileNode.filenode)) { + return true; + } + } + + return false; +} + +} // namespace ondemand_extreme_rto \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/posix_semaphore.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/posix_semaphore.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce56819487a5df3ce03e0dddcbd24ed04e9a9d1d --- /dev/null +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/posix_semaphore.cpp @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020 Huawei 
Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * ------------------------------------------------------------------------- + * + * posix_semaphore.cpp + * A thin wrapper to the posix semaphore library. All system errors are + * handled within the library. + * + * The reason why this wrapper is created instead of reusing PGSemaphore + * is that PGSemaphore can only be created in the postmaster process and + * the maximum number of PGSemaphores must be known at database startup. + * For parallel recovery, each log record that touches multiple pages + * requires a new semaphore, so we need an interface that can create and + * destroy semaphores dynamically. + * + * IDENTIFICATION + * gausskernel/storage/access/transam/ondemand_extreme_rto/posix_semaphore.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "knl/knl_variable.h" +#include "miscadmin.h" + +#include "access/ondemand_extreme_rto/posix_semaphore.h" + +namespace ondemand_extreme_rto { + +/* + * PosixSemaphoreInit + * -- Initialize a semaphore with the specified initial value. The + * semaphore must point to an allocated and zeroed structure. + * + * @in sem - The semaphore to be initialized. + * @in initValue - The initial value for the semaphore. + */ +void PosixSemaphoreInit(PosixSemaphore *sem, unsigned int initValue) +{ + Assert(!sem->initialized); + + if (sem_init(&sem->semaphore, 0, initValue) == 0) + sem->initialized = true; + else + ereport(FATAL, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("sem_init failed: %m"))); +} + +/* + * PosixSemaphoreDestroy + * -- Destroy a semaphore. It is OK to destroy an uninitialized semaphore + * if the structure had been zeroed. This is intentional to simplify + * the code that releases resources on error conditions. + * + * @in sem - The semaphore to destroy. + */ +void PosixSemaphoreDestroy(PosixSemaphore *sem) +{ + if (sem->initialized && sem_destroy(&sem->semaphore) != 0) + ereport(FATAL, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("sem_destroy failed: %m"))); + sem->initialized = false; +} + +/* + * PosixSemaphoreWait + * -- Decrement a semaphore, blocking if count would be < 0. + * + * @in sem - The semaphore to decrement. + */ +void PosixSemaphoreWait(PosixSemaphore *sem) +{ + int ret = 0; + do { + t_thrd.int_cxt.ImmediateInterruptOK = false; + CHECK_FOR_INTERRUPTS(); + } while ((ret = sem_wait(&sem->semaphore)) != 0 && errno == EINTR); + + if (ret != 0) + ereport(FATAL, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("sem_wait failed: %m"))); +} + +/* + * PosixSemaphorePost + * -- Increment a semaphore. + * + * @in sem - The semaphore to increment. 
+ */
+void PosixSemaphorePost(PosixSemaphore *sem)
+{
+    int ret = 0;
+    while ((ret = sem_post(&sem->semaphore)) != 0 && errno == EINTR)
+        ;
+
+    if (ret != 0)
+        ereport(FATAL, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("sem_post failed: %m")));
+}
+
+} // namespace ondemand_extreme_rto
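A typical lifecycle of this wrapper, as used for the phase-marker handshake between dispatcher and workers. This is an editor's sketch, not part of the patch; it assumes the PosixSemaphore type is declared in the same namespace and, as the comments above require, that the structure is zeroed before first use:

    #include "access/ondemand_extreme_rto/posix_semaphore.h"

    void PhaseMarkerExample()
    {
        ondemand_extreme_rto::PosixSemaphore marker = {}; // zeroed, so Destroy is always safe
        ondemand_extreme_rto::PosixSemaphoreInit(&marker, 0);

        ondemand_extreme_rto::PosixSemaphorePost(&marker); // worker: "last mark reached"
        ondemand_extreme_rto::PosixSemaphoreWait(&marker); // dispatcher: block until then

        ondemand_extreme_rto::PosixSemaphoreDestroy(&marker);
    }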
diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_item.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_item.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad3efa0f5177a62d1ad48ee3ee42638b19ea682a
--- /dev/null
+++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_item.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ * http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * redo_item.cpp
+ *      Each RedoItem represents a log record ready to be replayed by one of
+ *      the redo threads. To decouple the lifetime of a RedoItem from its
+ *      log record's original XLogReaderState, contents necessary for the
+ *      actual replay are duplicated into RedoItem's internal XLogReaderState.
+ *
+ * IDENTIFICATION
+ *      src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_item.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include
+#include
+
+#include "postgres.h"
+#include "knl/knl_variable.h"
+#include "access/xlogrecord.h"
+#include "access/xlog_internal.h"
+#include "utils/palloc.h"
+#include "utils/guc.h"
+
+#include "access/ondemand_extreme_rto/dispatcher.h"
+#include "access/ondemand_extreme_rto/redo_item.h"
+#include "postmaster/postmaster.h"
+#include "access/xlog.h"
+#include "access/multi_redo_api.h"
+
+namespace ondemand_extreme_rto {
+void DumpItem(RedoItem *item, const char *funcName)
+{
+    if (item == &g_redoEndMark || item == &g_terminateMark) {
+        return;
+    }
+    ereport(DEBUG4, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+        errmsg("[REDO_LOG_TRACE]DiagLogRedoRecord: %s, ReadRecPtr:%lu,EndRecPtr:%lu,"
+            "imcheckpoint:%u, recordXTime:%lu,"
+            "syncXLogReceiptSource:%d, RecentXmin:%lu, syncServerMode:%u",
+            funcName, item->record.ReadRecPtr, item->record.EndRecPtr,
+            item->needImmediateCheckpoint, item->recordXTime,
+            item->syncXLogReceiptSource, item->RecentXmin, item->syncServerMode)));
+    DiagLogRedoRecord(&(item->record), funcName);
+}
+} // namespace ondemand_extreme_rto
diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0bb706aca16303b2fbb81ac671b15d273cd4ae8b
--- /dev/null
+++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2023 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ * http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * redo_utils.cpp
+ *
+ * IDENTIFICATION
+ *      src/gausskernel/storage/access/transam/ondemand_extreme_rto/redo_utils.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "access/xlogproc.h"
+#include "access/ondemand_extreme_rto/batch_redo.h"
+#include "access/ondemand_extreme_rto/dispatcher.h"
+#include "access/ondemand_extreme_rto/redo_utils.h"
+#include "storage/lock/lwlock.h"
+
+/* add for batch redo mem manager */
+void *OndemandXLogMemCtlInit(RedoMemManager *memctl, Size itemsize, int itemnum)
+{
+    void *allocdata = NULL;
+    RedoMemSlot *nextfreeslot = NULL;
+    OndemandParseAllocCtrl *ctrl;
+
+    Assert(PARSEBUFFER_SIZE == itemsize);
+
+    allocdata = (void *)palloc(sizeof(OndemandParseAllocCtrl));
+    ctrl = (OndemandParseAllocCtrl *)allocdata;
+    ctrl->allocNum = itemnum / ONDEMAND_MAX_PARSEBUFF_PREPALLOC;
+    if ((int)(ctrl->allocNum * ONDEMAND_MAX_PARSEBUFF_PREPALLOC) != itemnum) {
+        ctrl->allocNum++;
+    }
+    ctrl->memslotEntry = (void *)palloc(sizeof(RedoMemSlot) * itemnum);
+
+    // palloc all parse mem entries
+    for (int i = 0; i < ctrl->allocNum; i++) {
+        ctrl->allocEntry[i] = (void *)palloc(ONDEMAND_MAX_PARSESIZE_PREPALLOC);
+        if (ctrl->allocEntry[i] == NULL) {
+            ereport(PANIC,
+                (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+                    errmsg("[SS] XLogMemCtlInit Allocated buffer failed!, totalblknum:%d, itemsize:%lu",
+                        itemnum, itemsize)));
+            /* panic */
+        }
+        errno_t rc = memset_s(ctrl->allocEntry[i], ONDEMAND_MAX_PARSESIZE_PREPALLOC, 0,
+            ONDEMAND_MAX_PARSESIZE_PREPALLOC);
+        securec_check(rc, "\0", "\0");
+    }
+    memctl->totalblknum = itemnum;
+    memctl->usedblknum = 0;
+    memctl->itemsize = itemsize;
+    memctl->memslot = (RedoMemSlot *)ctrl->memslotEntry;
+    nextfreeslot = memctl->memslot;
+    for (int i = memctl->totalblknum; i > 0; --i) {
+        memctl->memslot[i - 1].buf_id = i; /* start from 1, 0 is InvalidBuffer */
+        memctl->memslot[i - 1].freeNext = i - 1;
+    }
+    memctl->firstfreeslot = memctl->totalblknum;
+    memctl->firstreleaseslot = InvalidBuffer;
+    return allocdata;
+}
+
+RedoMemSlot *OndemandXLogMemAlloc(RedoMemManager *memctl)
+{
+    RedoMemSlot *nextfreeslot = NULL;
+    do {
+        LWLockAcquire(OndemandXlogMemAllocLock, LW_EXCLUSIVE);
+        if (memctl->firstfreeslot == InvalidBuffer) {
+            memctl->firstfreeslot = AtomicExchangeBuffer(&memctl->firstreleaseslot, InvalidBuffer);
+            pg_read_barrier();
+        }
+
+        if (memctl->firstfreeslot != InvalidBuffer) {
+            nextfreeslot = &(memctl->memslot[memctl->firstfreeslot - 1]);
+            memctl->firstfreeslot = nextfreeslot->freeNext;
+            memctl->usedblknum++;
+            nextfreeslot->freeNext = InvalidBuffer;
+        }
+        LWLockRelease(OndemandXlogMemAllocLock);
+
+        if (memctl->doInterrupt != NULL) {
+            memctl->doInterrupt();
+        }
+    } while (nextfreeslot == NULL);
+
+    return nextfreeslot;
+}
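OndemandXLogMemAlloc pops from a free list that only the allocator touches, and when that list is empty it swaps in, with one atomic exchange, the release list that OndemandXLogMemRelease (below) pushes onto. Slot ids are 1-based so that InvalidBuffer (0) can terminate the chains. A single-threaded editor's sketch of the two-list scheme, with invented names and no locking:

    #include <cstdint>
    #include <vector>

    struct Pool {
        std::vector<uint32_t> next; // next[i-1] chains slot id i; 0 terminates
        uint32_t freeHead = 0;      // popped by the allocator only
        uint32_t releaseHead = 0;   // pushed by releasers, swapped in when empty

        explicit Pool(uint32_t n) : next(n) {
            for (uint32_t id = n; id >= 1; --id) { next[id - 1] = freeHead; freeHead = id; }
        }
        uint32_t Alloc() {
            if (freeHead == 0) { freeHead = releaseHead; releaseHead = 0; } // the "exchange"
            if (freeHead == 0) return 0; // pool exhausted; caller retries
            uint32_t id = freeHead; freeHead = next[id - 1]; next[id - 1] = 0;
            return id;
        }
        void Release(uint32_t id) { next[id - 1] = releaseHead; releaseHead = id; }
    };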
+void OndemandXLogMemRelease(RedoMemManager *memctl, Buffer bufferid)
+{
+    RedoMemSlot *bufferslot;
+    if (!RedoMemIsValid(memctl, bufferid)) {
+        ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("XLogMemRelease failed!, totalblknum:%u, buf_id:%u", memctl->totalblknum, bufferid)));
+        /* panic */
+    }
+    bufferslot = &(memctl->memslot[bufferid - 1]);
+    Assert(bufferslot->freeNext == InvalidBuffer);
+    LWLockAcquire(OndemandXlogMemAllocLock, LW_EXCLUSIVE);
+    Buffer oldFirst = AtomicReadBuffer(&memctl->firstreleaseslot);
+    pg_memory_barrier();
+    do {
+        AtomicWriteBuffer(&bufferslot->freeNext, oldFirst);
+    } while (!AtomicCompareExchangeBuffer(&memctl->firstreleaseslot, &oldFirst, bufferid));
+    memctl->usedblknum--;
+    LWLockRelease(OndemandXlogMemAllocLock);
+}
+
+void OndemandXLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate,
+    InterruptFunc interruptOperte)
+{
+    void *allocdata = NULL;
+    allocdata = OndemandXLogMemCtlInit(&(parsemanager->memctl), (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)), buffernum);
+    parsemanager->parsebuffers = allocdata;
+    parsemanager->refOperate = refOperate;
+    parsemanager->memctl.doInterrupt = interruptOperte;
+    parsemanager->memctl.isInit = true;
+
+    g_parseManager = parsemanager;
+    return;
+}
+
+void OndemandXLogParseBufferDestory(RedoParseManager *parsemanager)
+{
+    g_parseManager = NULL;
+    OndemandParseAllocCtrl *ctrl = (OndemandParseAllocCtrl *)parsemanager->parsebuffers;
+
+    if (ctrl != NULL) {
+        for (int i = 0; i < ctrl->allocNum; i++) {
+            pfree(ctrl->allocEntry[i]);
+        }
+        pfree(ctrl->memslotEntry);
+        pfree(ctrl);
+        parsemanager->parsebuffers = NULL;
+    }
+    parsemanager->memctl.isInit = false;
+}
+
+ParseBufferDesc *OndemandGetParseMemSlot(OndemandParseAllocCtrl *ctrl, int itemIndex)
+{
+    int entryIndex = itemIndex / ONDEMAND_MAX_PARSEBUFF_PREPALLOC;
+    int entryOffset = (itemIndex - (entryIndex * ONDEMAND_MAX_PARSEBUFF_PREPALLOC)) * PARSEBUFFER_SIZE;
+    return (ParseBufferDesc *)((char *)ctrl->allocEntry[entryIndex] + entryOffset);
+}
+
+XLogRecParseState *OndemandXLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead,
+    void *record)
+{
+    RedoMemManager *memctl = &(parsemanager->memctl);
+    RedoMemSlot *allocslot = NULL;
+    ParseBufferDesc *descstate = NULL;
+    XLogRecParseState *recordstate = NULL;
+
+    allocslot = OndemandXLogMemAlloc(memctl);
+    if (allocslot == NULL) {
+        ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("XLogParseBufferAlloc Allocated buffer failed!, totalblknum:%u, usedblknum:%u",
+                memctl->totalblknum, memctl->usedblknum)));
+        return NULL;
+    }
+
+    pg_read_barrier();
+    Assert(allocslot->buf_id != InvalidBuffer);
+    Assert(memctl->itemsize == (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)));
+    descstate = OndemandGetParseMemSlot((OndemandParseAllocCtrl *)parsemanager->parsebuffers, allocslot->buf_id - 1);
+    descstate->buff_id = allocslot->buf_id;
+    Assert(descstate->state == 0);
+    descstate->state = 1;
+    descstate->refcount = 0;
+    recordstate = (XLogRecParseState *)((char *)descstate + sizeof(ParseBufferDesc));
+    recordstate->nextrecord = NULL;
+    recordstate->manager = parsemanager;
+    recordstate->refrecord = record;
+    recordstate->isFullSync = false;
+    recordstate->distributeStatus = XLOG_NO_DISTRIBUTE;
+    if (blkstatehead != NULL) {
+        recordstate->nextrecord = blkstatehead->nextrecord;
+        blkstatehead->nextrecord = (void *)recordstate;
+    }
+
+    if (parsemanager->refOperate != NULL)
+        parsemanager->refOperate->refCount(record);
+
+    return recordstate;
+}
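Parse buffers are pre-allocated in large chunks, and OndemandGetParseMemSlot above converts a flat slot index into a chunk index plus a byte offset. The arithmetic in isolation, with small illustrative constants standing in for ONDEMAND_MAX_PARSEBUFF_PREPALLOC and PARSEBUFFER_SIZE (editor's sketch):

    #include <cassert>

    const int kPerChunk = 4;  // stand-in for ONDEMAND_MAX_PARSEBUFF_PREPALLOC
    const int kItemSize = 64; // stand-in for PARSEBUFFER_SIZE

    void MapIndex(int itemIndex, int *chunk, int *byteOffset)
    {
        *chunk = itemIndex / kPerChunk;
        *byteOffset = (itemIndex - *chunk * kPerChunk) * kItemSize;
    }

    void MapIndexExample()
    {
        int chunk = 0;
        int off = 0;
        MapIndex(6, &chunk, &off); // the seventh slot overall
        assert(chunk == 1 && off == 2 * kItemSize);
    }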
+void OndemandXLogParseBufferRelease(XLogRecParseState *recordstate)
+{
+    RedoMemManager *memctl = &(recordstate->manager->memctl);
+    ParseBufferDesc *descstate = NULL;
+
+    descstate = (ParseBufferDesc *)((char *)recordstate - sizeof(ParseBufferDesc));
+    if (!RedoMemIsValid(memctl, descstate->buff_id) || descstate->state == 0) {
+        ereport(PANIC, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+            errmsg("XLogParseBufferRelease failed!, totalblknum:%u, buf_id:%u", memctl->totalblknum,
+                descstate->buff_id)));
+        /* panic */
+    }
+
+    descstate->state = 0;
+
+    OndemandXLogMemRelease(memctl, descstate->buff_id);
+}
+
+BufferDesc *RedoForOndemandExtremeRTOQuery(BufferDesc *bufHdr, char relpersistence,
+    ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode)
+{
+    bool hashFound = false;
+    bool needMarkDirty = false;
+    unsigned int new_hash;
+    LWLock *xlog_partition_lock;
+    Buffer buf = BufferDescriptorGetBuffer(bufHdr);
+    ondemand_extreme_rto::RedoItemHashEntry *redoItemEntry = NULL;
+    ondemand_extreme_rto::RedoItemTag redoItemTag;
+    XLogRecParseState *procState = NULL;
+    XLogBlockHead *procBlockHead = NULL;
+    XLogBlockHead *blockHead = NULL;
+    RedoBufferInfo bufferInfo;
+    int rc;
+
+    INIT_REDO_ITEM_TAG(redoItemTag, bufHdr->tag.rnode, forkNum, blockNum);
+
+    uint32 id = ondemand_extreme_rto::GetSlotId(bufHdr->tag.rnode, 0, 0, ondemand_extreme_rto::GetBatchCount());
+    HTAB *hashMap = g_instance.comm_cxt.predo_cxt.redoItemHash[id];
+    if (hashMap == NULL) {
+        ereport(ERROR, (errcode(ERRCODE_DATA_CORRUPTED),
+            errmsg("redo item hash table corrupted, there has invalid hashtable.")));
+    }
+
+    new_hash = ondemand_extreme_rto::XlogTrackTableHashCode(&redoItemTag);
+    xlog_partition_lock = XlogTrackMappingPartitionLock(new_hash);
+    (void)LWLockAcquire(xlog_partition_lock, LW_SHARED);
+    redoItemEntry = (ondemand_extreme_rto::RedoItemHashEntry *)hash_search(hashMap, (void *)&redoItemTag, HASH_FIND, &hashFound);
+
+    /* Page is already up-to-date, no need to replay. */
+    if (!hashFound || redoItemEntry->redoItemNum == 0 || redoItemEntry->redoDone) {
+        LWLockRelease(xlog_partition_lock);
+        return bufHdr;
+    }
+
+    /* switch to the exclusive lock for replay */
+    LWLockRelease(xlog_partition_lock);
+    (void)LWLockAcquire(xlog_partition_lock, LW_EXCLUSIVE);
+
+    rc = memset_s(&bufferInfo, sizeof(bufferInfo), 0, sizeof(bufferInfo));
+    securec_check(rc, "\0", "\0");
+    if (BufferIsValid(buf)) {
+        bufferInfo.buf = buf;
+        bufferInfo.pageinfo.page = BufferGetPage(buf);
+        bufferInfo.pageinfo.pagesize = BufferGetPageSize(buf);
+    }
+
+    procState = (XLogRecParseState *)redoItemEntry->head;
+    procBlockHead = &procState->blockparse.blockhead;
+
+    XLogBlockInitRedoBlockInfo(procBlockHead, &bufferInfo.blockinfo);
+
+    Assert(mode == RBM_NORMAL || mode == RBM_ZERO_ON_ERROR);
+
+    /* lock the shared buffer for replaying the xlog */
+    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+    while (procState != NULL) {
+        XLogRecParseState *redoBlockState = procState;
+        ondemand_extreme_rto::ReferenceRecParseState(redoBlockState);
+
+        procState = (XLogRecParseState *)procState->nextrecord;
+        procBlockHead = &procState->blockparse.blockhead;
+
+        blockHead = &redoBlockState->blockparse.blockhead;
+        uint16 blockValid = XLogBlockHeadGetValidInfo(blockHead);
+
+        if (XLogRecPtrIsInvalid(bufferInfo.lsn)) {
+            bufferInfo.lsn = PageGetLSN(bufferInfo.pageinfo.page);
+        }
+        if (XLByteLE(XLogBlockHeadGetLSN(blockHead), PageGetLSN(bufferInfo.pageinfo.page))) {
+            ondemand_extreme_rto::DereferenceRecParseState(redoBlockState);
+            continue;
+        }
+
+        switch (blockValid) {
+            case BLOCK_DATA_MAIN_DATA_TYPE:
+            case BLOCK_DATA_UNDO_TYPE:
+            case BLOCK_DATA_VM_TYPE:
+            case BLOCK_DATA_FSM_TYPE:
+                needMarkDirty = true;
+                XlogBlockRedoForOndemandExtremeRTOQuery(redoBlockState, &bufferInfo);
+                break;
+            case BLOCK_DATA_XLOG_COMMON_TYPE:
+            case BLOCK_DATA_DDL_TYPE:
+            case BLOCK_DATA_DROP_DATABASE_TYPE:
+            case BLOCK_DATA_NEWCU_TYPE:
+            default:
+                Assert(0);
+                break;
+        }
+
+        ondemand_extreme_rto::DereferenceRecParseState(redoBlockState);
+    }
+
+    /* mark the latest buffer dirty */
+    if (needMarkDirty) {
+        MarkBufferDirty(buf);
+    }
+
+    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+    redoItemEntry->redoDone = true;
+    LWLockRelease(xlog_partition_lock);
+
+    return bufHdr;
+}
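The replay loop above is idempotent because of the LSN gate: a queued record is applied only when its LSN is newer than the page's current LSN, and the page LSN advances as records are applied, so a crash or a concurrent earlier replay cannot double-apply a change. The gate in isolation (editor's sketch with simplified types, not the patch's API):

    #include <cstdint>
    #include <vector>

    struct PendingRec { uint64_t lsn; /* redo payload omitted */ };
    struct Page { uint64_t lsn; };

    bool ReplayPending(Page &page, const std::vector<PendingRec> &pending)
    {
        bool dirtied = false;
        for (const PendingRec &rec : pending) {
            if (rec.lsn <= page.lsn) {
                continue; // change already reflected in the page
            }
            page.lsn = rec.lsn; // "apply" the record, then advance the page LSN
            dirtied = true;
        }
        return dirtied; // caller marks the buffer dirty once at the end
    }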
+void OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(int code)
+{
+    ondemand_extreme_rto::SendRecoveryEndMarkToWorkersAndWaitForReach(code);
+}
+
+void OnDemandWaitRedoFinish()
+{
+    ondemand_extreme_rto::WaitRedoFinish();
+}
diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/spsc_blocking_queue.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/spsc_blocking_queue.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b9a340afe7a1c839baf5b46bd42871e82b49296
--- /dev/null
+++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/spsc_blocking_queue.cpp
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ * http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * spsc_blocking_queue.cpp
+ *      A bounded queue that supports operations that wait for the queue to
+ *      become non-empty when retrieving an element, and wait for space to
+ *      become available in the queue when storing an element.
+ *
+ *      This structure is limited to Single-Producer/Single-Consumer, so the
+ *      internal data can be accessed without locks.
+ *
+ * IDENTIFICATION
+ *      src/gausskernel/storage/access/transam/ondemand_extreme_rto/spsc_blocking_queue.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include
+
+#include "postgres.h"
+#include "knl/knl_variable.h"
+#include "utils/atomic.h"
+#include "utils/palloc.h"
+
+#include "access/ondemand_extreme_rto/spsc_blocking_queue.h"
+#include "access/ondemand_extreme_rto/page_redo.h"
+#include "utils/elog.h"
+
+namespace ondemand_extreme_rto {
+#define IN
+#define OUT
+#define INOUT
+
+#define POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
+#define COUNT(head, tail, mask) ((uint32)(((head) - (tail)) & (mask)))
+#define SPACE(head, tail, mask) ((uint32)(((tail) - ((head) + 1)) & (mask)))
+
+const uint32 MAX_REDO_QUE_TAKE_DELAY = 200; /* 200 us */
+const uint32 MAX_REDO_QUE_IDEL_TAKE_DELAY = 1000;
+const uint32 SLEEP_COUNT_QUE_TAKE = 0xFFF;
+
+const int QUEUE_CAPACITY_MIN_LIMIT = 2;
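Before the implementation, a condensed editor's sketch of the same single-producer/single-consumer discipline using std::atomic: capacity is a power of two, indices are kept pre-masked, and one slot is sacrificed so that full and empty remain distinguishable. This illustrates the technique; it is not a drop-in replacement for the code below:

    #include <atomic>
    #include <cstdint>

    template <uint32_t kCap> // kCap must be a power of two, >= 2
    struct SpscRing {
        static_assert(kCap >= 2 && (kCap & (kCap - 1)) == 0, "power of two");
        void *buf[kCap];
        std::atomic<uint32_t> head{0}; // written by the producer only
        std::atomic<uint32_t> tail{0}; // written by the consumer only

        bool TryPut(void *e) {
            uint32_t h = head.load(std::memory_order_relaxed);
            uint32_t t = tail.load(std::memory_order_acquire);
            if (((t - (h + 1)) & (kCap - 1)) == 0) return false; // SPACE() == 0
            buf[h] = e;
            head.store((h + 1) & (kCap - 1), std::memory_order_release);
            return true;
        }
        bool TryTake(void **e) {
            uint32_t t = tail.load(std::memory_order_relaxed);
            uint32_t h = head.load(std::memory_order_acquire);
            if (((h - t) & (kCap - 1)) == 0) return false; // COUNT() == 0
            *e = buf[t];
            tail.store((t + 1) & (kCap - 1), std::memory_order_release);
            return true;
        }
    };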
+ */ + Assert(capacity >= QUEUE_CAPACITY_MIN_LIMIT && POWER_OF_TWO(capacity)); + + size_t allocSize = sizeof(SPSCBlockingQueue) + sizeof(void *) * capacity; + SPSCBlockingQueue *queue = (SPSCBlockingQueue *)palloc0(allocSize); + + uint32 mask = capacity - 1; + pg_atomic_init_u32(&queue->writeHead, 0); + pg_atomic_init_u32(&queue->readTail, 0); + queue->capacity = capacity; + queue->mask = mask; + queue->maxUsage = 0; + queue->totalCnt = 0; + queue->callBackFunc = func; + return queue; +} + +void SPSCBlockingQueueDestroy(SPSCBlockingQueue *queue) +{ + pfree(queue); +} + +bool SPSCBlockingQueuePut(SPSCBlockingQueue *queue, void *element) +{ + uint32 head = pg_atomic_read_u32(&queue->writeHead); + uint32 tail = pg_atomic_read_u32(&queue->readTail); + while (SPACE(head, tail, queue->mask) == 0) { + if (queue->callBackFunc != NULL) { + queue->callBackFunc(); + } + tail = pg_atomic_read_u32(&queue->readTail); + } + + /* + * Make sure the following write to the buffer happens after the read + * of the tail. Combining this with the corresponding barrier in Take() + * which guarantees that the tail is updated after reading the buffer, + * we can be sure that we cannot update a slot's value before it has + * been read. + */ + pg_memory_barrier(); + uint32 tmpCnt = COUNT(head, tail, queue->mask); + if (tmpCnt > queue->maxUsage) { + pg_atomic_write_u32(&queue->maxUsage, tmpCnt); + } + + queue->buffer[head] = element; + + /* Make sure the index is updated after the buffer has been written. */ + pg_write_barrier(); + + pg_atomic_write_u32(&queue->writeHead, (head + 1) & queue->mask); + return true; +} + +uint32 SPSCGetQueueCount(SPSCBlockingQueue *queue) +{ + uint32 head = pg_atomic_read_u32(&queue->writeHead); + uint32 tail = pg_atomic_read_u32(&queue->readTail); + return (COUNT(head, tail, queue->mask)); +} + +void *SPSCBlockingQueueTake(SPSCBlockingQueue *queue) +{ + uint32 head; + uint32 tail; + uint32 count = 0; + long sleeptime; + tail = pg_atomic_read_u32(&queue->readTail); + head = pg_atomic_read_u32(&queue->writeHead); + while (COUNT(head, tail, queue->mask) == 0) { + ++count; + /* here we sleep, let the cpu to do other important work */ + if ((count & SLEEP_COUNT_QUE_TAKE) == SLEEP_COUNT_QUE_TAKE) { + if (t_thrd.page_redo_cxt.sleep_long) + sleeptime = MAX_REDO_QUE_IDEL_TAKE_DELAY; + else + sleeptime = MAX_REDO_QUE_TAKE_DELAY; + pg_usleep(sleeptime); + } + if (queue->callBackFunc != NULL) { + queue->callBackFunc(); + } + head = pg_atomic_read_u32(&queue->writeHead); + } + + t_thrd.page_redo_cxt.sleep_long = false; + /* Make sure the buffer is read after the index. */ + pg_read_barrier(); + + void *elem = queue->buffer[tail]; + + /* Make sure the read of the buffer finishes before updating the tail. 
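Otherwise the producer, seeing the advanced tail, could reuse the slot while it is still being read.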
*/ + pg_memory_barrier(); + + pg_atomic_write_u32(&queue->readTail, (tail + 1) & queue->mask); + return elem; +} + +bool SPSCBlockingQueueGetAll(SPSCBlockingQueue *queue, void ***eleArry, uint32 *eleNum) +{ + uint32 head; + uint32 tail; + uint32 count = 0; + long sleeptime; + + tail = pg_atomic_read_u32(&queue->readTail); + head = pg_atomic_read_u32(&queue->writeHead); + while (COUNT(head, tail, queue->mask) == 0) { + ++count; + /* here we sleep, let the cpu to do other important work */ + if ((count & SLEEP_COUNT_QUE_TAKE) == SLEEP_COUNT_QUE_TAKE) { + if (t_thrd.page_redo_cxt.sleep_long) + sleeptime = MAX_REDO_QUE_IDEL_TAKE_DELAY; + else + sleeptime = MAX_REDO_QUE_TAKE_DELAY; + pg_usleep(sleeptime); + } + if (queue->callBackFunc != NULL) { + queue->callBackFunc(); + } + head = pg_atomic_read_u32(&queue->writeHead); + } + t_thrd.page_redo_cxt.sleep_long = false; + /* Make sure the buffer is read after the index. */ + pg_read_barrier(); + head = head & (queue->mask); + tail = tail & (queue->mask); + if (head >= tail) { + *eleNum = head - tail; + } else { + *eleNum = queue->capacity - tail; + } + *eleArry = &(queue->buffer[tail]); + return true; +} + +/* for high performance, we do not put any check here. */ +void SPSCBlockingQueuePopN(SPSCBlockingQueue *queue, uint32 n) +{ + uint32 head; + uint32 tail; + uint32 queueCnt; + uint64 totalCnt = pg_atomic_read_u64(&queue->totalCnt); + tail = pg_atomic_read_u32(&queue->readTail); + head = pg_atomic_read_u32(&queue->writeHead); + queueCnt = COUNT(head, tail, queue->mask); + + /* make sure pop n is less than queueCnt, tail will not exceed capacity. */ + if (queueCnt < n || ((tail & (queue->mask)) + n) > queue->capacity) { + ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), + errmsg("SPSCBlockingQueuePopN queue error, " + "queueCnt:%u, n:%u, capacity:%u", + queueCnt, n, queue->capacity))); + return; + } + + /* Make sure the read of the buffer finishes before updating the tail. 
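The same ordering rule as in Take() applies here, only for n slots at a time.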
*/
+ pg_memory_barrier();
+ pg_atomic_write_u64(&queue->totalCnt, (totalCnt + n));
+ pg_atomic_write_u32(&queue->readTail, (tail + n) & queue->mask);
+}
+
+bool SPSCBlockingQueueIsEmpty(SPSCBlockingQueue *queue)
+{
+ uint32 head = pg_atomic_read_u32(&queue->writeHead);
+ uint32 tail = pg_atomic_read_u32(&queue->readTail);
+ return (COUNT(head, tail, queue->mask) == 0);
+}
+
+void *SPSCBlockingQueueTop(SPSCBlockingQueue *queue)
+{
+ uint32 head;
+ uint32 tail;
+ uint32 count = 0;
+ long sleeptime;
+ tail = pg_atomic_read_u32(&queue->readTail);
+ head = pg_atomic_read_u32(&queue->writeHead);
+ while (COUNT(head, tail, queue->mask) == 0) {
+ ++count;
+ /* here we sleep, let the cpu to do other important work */
+ if ((count & SLEEP_COUNT_QUE_TAKE) == SLEEP_COUNT_QUE_TAKE) {
+ if (t_thrd.page_redo_cxt.sleep_long)
+ sleeptime = MAX_REDO_QUE_IDEL_TAKE_DELAY;
+ else
+ sleeptime = MAX_REDO_QUE_TAKE_DELAY;
+ pg_usleep(sleeptime);
+ }
+ if (queue->callBackFunc != NULL) {
+ queue->callBackFunc();
+ }
+ head = pg_atomic_read_u32(&queue->writeHead);
+ }
+ t_thrd.page_redo_cxt.sleep_long = false;
+ pg_read_barrier();
+ void *elem = queue->buffer[tail];
+ return elem;
+}
+
+void SPSCBlockingQueuePop(SPSCBlockingQueue *queue)
+{
+ uint32 head;
+ uint32 tail;
+ uint64 totalCnt = pg_atomic_read_u64(&queue->totalCnt);
+ tail = pg_atomic_read_u32(&queue->readTail);
+ head = pg_atomic_read_u32(&queue->writeHead);
+ if (COUNT(head, tail, queue->mask) == 0) {
+ ereport(WARNING, (errmodule(MOD_REDO), errcode(ERRCODE_LOG), errmsg("SPSCBlockingQueuePop queue error!")));
+ return;
+ }
+
+ /* Make sure the read of the buffer finishes before updating the tail. */
+ pg_memory_barrier();
+ pg_atomic_write_u64(&queue->totalCnt, (totalCnt + 1));
+ pg_atomic_write_u32(&queue->readTail, (tail + 1) & queue->mask);
+}
+
+void DumpQueue(const SPSCBlockingQueue *queue)
+{
+ ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
+ errmsg("[REDO_LOG_TRACE]queue info: writeHead %u, readTail %u, capacity %u, mask %u",
+ queue->writeHead, queue->readTail, queue->capacity, queue->mask)));
+}
+} // namespace ondemand_extreme_rto
diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/txn_redo.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/txn_redo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a547fbbd5bfa8b8d8b12ef8c8fbeb6544435f056
--- /dev/null
+++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/txn_redo.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Huawei Technologies Co.,Ltd.
+ *
+ * openGauss is licensed under Mulan PSL v2.
+ * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at:
+ *
+ * http://license.coscl.org.cn/MulanPSL2
+ *
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
+ * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
+ * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
+ * See the Mulan PSL v2 for more details.
+ * -------------------------------------------------------------------------
+ *
+ * txn_redo.cpp
+ * TxnRedoWorker runs in the dispatcher thread to ease the management
+ * of transaction status and global variables. In principle, we can
+ * run the TxnRedoWorker in a separate thread, but we don't do it for
+ * now for simplicity.
+ * To ensure read consistency on hot-standby replicas, transactions on
+ * replicas must commit in the same order as the master.
This is the + * main reason to use a dedicated worker to replay transaction logs. + * To ensure data consistency within a transaction, the transaction + * commit log must be replayed after all data logs for the transaction + * have been replayed by PageRedoWorkers. + * + * IDENTIFICATION + * src/gausskernel/storage/access/transam/ondemand_extreme_rto/txn_redo.cpp + * + * ------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "knl/knl_variable.h" +#include "postmaster/startup.h" +#include "access/xlog.h" +#include "utils/palloc.h" +#include "utils/guc.h" +#include "portability/instr_time.h" + +#include "access/ondemand_extreme_rto/dispatcher.h" +#include "access/ondemand_extreme_rto/txn_redo.h" +#include "access/xlogreader.h" +#include "pgstat.h" +#include "storage/standby.h" +#include "catalog/pg_control.h" + +namespace ondemand_extreme_rto { + +void AddTxnRedoItem(PageRedoWorker *worker, void *item) +{ + (void)SPSCBlockingQueuePut(worker->queue, item); +} + +} // namespace ondemand_extreme_rto diff --git a/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e3c8726dfb85340951d3314b2b56a62f2125b3af --- /dev/null +++ b/src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * -------------------------------------------------------------------------
+ *
+ * xlog_read.cpp
+ *
+ * IDENTIFICATION
+ * src/gausskernel/storage/access/transam/ondemand_extreme_rto/xlog_read.cpp
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#include "access/ondemand_extreme_rto/spsc_blocking_queue.h"
+#include "access/ondemand_extreme_rto/dispatcher.h"
+#include "access/multi_redo_api.h"
+#include "access/xlog.h"
+#include "ddes/dms/ss_reform_common.h"
+#include "replication/walreceiver.h"
+#include "replication/dcf_replication.h"
+#include "replication/shared_storage_walreceiver.h"
+#include "storage/ipc.h"
+
+namespace ondemand_extreme_rto {
+static bool DoEarlyExit()
+{
+ if (g_dispatcher == NULL) {
+ return false;
+ }
+ return g_dispatcher->recoveryStop;
+}
+
+inline static XLogReaderState *ReadNextRecordFromQueue(int emode)
+{
+ char *errormsg = NULL;
+ SPSCBlockingQueue *linequeue = g_dispatcher->readLine.readPageThd->queue;
+ XLogReaderState *xlogreader = NULL;
+ do {
+ xlogreader = (XLogReaderState *)SPSCBlockingQueueTake(linequeue);
+ if (!xlogreader->isDecode) {
+ XLogRecord *record = (XLogRecord *)xlogreader->readRecordBuf;
+ GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_5]);
+ if (!DecodeXLogRecord(xlogreader, record, &errormsg)) {
+ ereport(emode,
+ (errmsg("ReadNextRecordFromQueue %X/%X decode error, %s", (uint32)(xlogreader->EndRecPtr >> 32),
+ (uint32)(xlogreader->EndRecPtr), errormsg)));
+
+ RedoItem *item = GetRedoItemPtr(xlogreader);
+
+ FreeRedoItem(item);
+
+ xlogreader = NULL;
+ }
+ CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_5]);
+ }
+
+ if ((void *)xlogreader == (void *)&(g_GlobalLsnForwarder.record) ||
+ (void *)xlogreader == (void *)&(g_cleanupMark.record)) {
+ StartupSendFowarder(GetRedoItemPtr(xlogreader));
+ xlogreader = NULL;
+ }
+
+ RedoInterruptCallBack();
+ } while (xlogreader == NULL);
+
+ return xlogreader;
+}
+
+XLogRecord *ReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode)
+{
+ XLogRecord *record = NULL;
+ XLogReaderState *xlogreader = ReadNextRecordFromQueue(emode);
+
+ if ((void *)xlogreader != (void *)&(g_redoEndMark.record)) {
+ *xlogreaderptr = xlogreader;
+ t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr;
+ t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr;
+ record = (XLogRecord *)xlogreader->readRecordBuf;
+ } else {
+ *xlogreaderptr = &g_redoEndMark.record;
+ if (t_thrd.startup_cxt.shutdown_requested) {
+ proc_exit(0);
+ }
+ }
+ return record;
+}
+
+void SwitchToReadXlogFromFile(XLogRecPtr pageptr)
+{
+ pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readSource, XLOG_FROM_PG_XLOG);
+ pg_atomic_write_u64(&g_dispatcher->rtoXlogBufState.expectLsn, InvalidXLogRecPtr);
+ pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_STOPPING);
+ uint32 workerState = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+ while (workerState != WORKER_STATE_EXIT && workerState != WORKER_STATE_STOP) {
+ RedoInterruptCallBack();
+ workerState = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState));
+ }
+}
+
+bool HasReceivedTrigger()
+{
+ uint32 trigger = pg_atomic_read_u32(&g_readManagerTriggerFlag);
+ if (trigger > 0) {
+ pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.readSource, XLOG_FROM_PG_XLOG);
+ pg_atomic_write_u32(&(g_recordbuffer->readWorkerState), WORKER_STATE_STOPPING);
+ return true;
+ }
+ return false;
+}
+
+// receivedUpto indicates new data has arrived, but it may not be readable yet; check the receiver status first
+bool IsReceivingStatusOk()
+{
+ WalRcvCtlBlock
*walrcb = getCurrentWalRcvCtlBlock(); + uint32 startreadworker = pg_atomic_read_u32(&(g_recordbuffer->readWorkerState)); + if (startreadworker == WORKER_STATE_STOP && walrcb == NULL) { + return false; + } + return true; +} + +inline XLogRecPtr CalcExpectLsn(XLogRecPtr recPtr) +{ + XLogRecPtr expectedRecPtr = recPtr; + if (recPtr % XLogSegSize == 0) { + XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD); + } else if (recPtr % XLOG_BLCKSZ == 0) { + XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD); + } + return expectedRecPtr; +} + +int ParallelXLogReadWorkBufRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, TimeLineID *readTLI) +{ + XLogRecPtr RecPtr = targetPagePtr; + uint32 targetPageOff = targetPagePtr % XLogSegSize; + + XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo); + XLByteAdvance(RecPtr, reqLen); + + XLogRecPtr expectedRecPtr = CalcExpectLsn(RecPtr); + uint64 waitXLogCount = 0; + const uint64 pushLsnCount = 2; + + pg_atomic_write_u64(&g_dispatcher->rtoXlogBufState.expectLsn, expectedRecPtr); + for (;;) { + // Check to see if the trigger file exists. If so, update the gaussdb state file. + if (CheckForStandbyTrigger() +#ifndef ENABLE_MULTIPLE_NODES + && IsDCFReadyOrDisabled() +#endif + ) { + SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL); + } + + /* + * If we find an invalid record in the WAL streamed from + * master, something is seriously wrong. There's little + * chance that the problem will just go away, but PANIC is + * not good for availability either, especially in hot + * standby mode. Disconnect, and retry from + * archive/pg_xlog again. The WAL in the archive should be + * identical to what was streamed, so it's unlikely that + * it helps, but one can hope... + */ + if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) { + pg_atomic_write_u32(&g_dispatcher->rtoXlogBufState.failSource, XLOG_FROM_STREAM); + SwitchToReadXlogFromFile(targetPagePtr); + return -1; + } + + ResetRtoXlogReadBuf(targetPagePtr); + /* + * Walreceiver is active, so see if new data has arrived. + * + * We only advance XLogReceiptTime when we obtain fresh + * WAL from walreceiver and observe that we had already + * processed everything before the most recent "chunk" + * that it flushed to disk. In steady state where we are + * keeping up with the incoming data, XLogReceiptTime will + * be updated on each cycle. When we are behind, + * XLogReceiptTime will not advance, so the grace time + * alloted to conflicting queries will decrease. + */ + bool havedata = NewDataIsInBuf(expectedRecPtr); + if (havedata) { + /* just make sure source info is correct... 
*/ + t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; + t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; + waitXLogCount = 0; + if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) { + t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ; + } else { + t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff; + } + + /* read from wal writer buffer */ + bool readflag = XLogPageReadForExtRto(xlogreader, targetPagePtr, t_thrd.xlog_cxt.readLen); + if (readflag) { + *readTLI = t_thrd.xlog_cxt.curFileTLI; + return t_thrd.xlog_cxt.readLen; + } else { + if (!IsReceivingStatusOk()) { + SwitchToReadXlogFromFile(targetPagePtr); + return -1; + } + } + } else { + if (HasReceivedTrigger()) { + return -1; + } + + uint32 waitRedoDone = pg_atomic_read_u32(&g_dispatcher->rtoXlogBufState.waitRedoDone); + if (waitRedoDone == 1 || DoEarlyExit()) { + SwitchToReadXlogFromFile(targetPagePtr); + return -1; + } + /* + * Wait for more WAL to arrive, or timeout to be reached + */ + WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L); + ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch); + if (waitXLogCount == pushLsnCount) { + PushToWorkerLsn(); + } + ++waitXLogCount; + } + + RedoInterruptCallBack(); + } + + return -1; +} + +int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, + TimeLineID *readTLI) +{ + int readLen = -1; + pg_atomic_write_u64(&g_dispatcher->rtoXlogBufState.targetRecPtr, targetRecPtr); + xlogreader->readBuf = g_dispatcher->rtoXlogBufState.readBuf; + + for (;;) { + uint32 readSource = pg_atomic_read_u32(&(g_recordbuffer->readSource)); + if (readSource & XLOG_FROM_STREAM) { + readLen = ParallelXLogReadWorkBufRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); + } else { + readLen = SSXLogPageRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, xlogreader->readBuf, + readTLI, NULL); + } + + if (readLen > 0 || t_thrd.xlog_cxt.recoveryTriggered || !t_thrd.xlog_cxt.StandbyMode || DoEarlyExit()) { + return readLen; + } + + RedoInterruptCallBack(); + ADD_ABNORMAL_POSITION(10); + } + + return readLen; +} + +int ParallelReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) +{ + int readLen; + uint32 targetPageOff; + XLogSegNo targetSegNo; + XLogPageHeader hdr; + + Assert((pageptr % XLOG_BLCKSZ) == 0); + + XLByteToSeg(pageptr, targetSegNo); + targetPageOff = (pageptr % XLogSegSize); + + /* check whether we have all the requested data already */ + if (targetSegNo == state->readSegNo && targetPageOff == state->readOff && reqLen < (int)state->readLen) { + return state->readLen; + } + + /* + * First, read the requested data length, but at least a short page header + * so that we can validate it. + */ + readLen = ParallelXLogPageRead(state, pageptr, Max(reqLen, (int)SizeOfXLogShortPHD), state->currRecPtr, + &state->readPageTLI); + if (readLen < 0) { + goto err; + } + + Assert(readLen <= XLOG_BLCKSZ); + + /* Do we have enough data to check the header length? */ + if (readLen <= (int)SizeOfXLogShortPHD) { + goto err; + } + + Assert(readLen >= reqLen); + + hdr = (XLogPageHeader)state->readBuf; + + /* still not enough */ + if (readLen < (int)XLogPageHeaderSize(hdr)) { + readLen = ParallelXLogPageRead(state, pageptr, XLogPageHeaderSize(hdr), state->currRecPtr, &state->readPageTLI); + if (readLen < 0) { + goto err; + } + } + + /* + * Now that we know we have the full header, validate it. 
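+ * (ValidXLogPageHeader checks, among other things, the page magic, the
+ * expected page address and the timeline, so torn or recycled pages are
+ * rejected here.)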
+ */ + if (!ValidXLogPageHeader(state, pageptr, hdr)) { + goto err; + } + + /* update read state information */ + state->readSegNo = targetSegNo; + state->readOff = targetPageOff; + state->readLen = readLen; + + return readLen; + +err: + XLogReaderInvalReadState(state); + return -1; +} + +XLogRecord *ParallelReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) +{ + XLogRecord *record = NULL; + XLogRecPtr targetPagePtr; + bool randAccess = false; + uint32 len, total_len; + uint32 targetRecOff; + uint32 pageHeaderSize; + bool gotheader = false; + int readOff; + errno_t errorno = EOK; + + /* + * randAccess indicates whether to verify the previous-record pointer of + * the record we're reading. We only do this if we're reading + * sequentially, which is what we initially assume. + */ + randAccess = false; + + /* reset error state */ + *errormsg = NULL; + state->errormsg_buf[0] = '\0'; + + if (XLByteEQ(RecPtr, InvalidXLogRecPtr)) { + /* No explicit start point; read the record after the one we just read */ + RecPtr = state->EndRecPtr; + + if (XLByteEQ(state->ReadRecPtr, InvalidXLogRecPtr)) + randAccess = true; + + /* + * If at page start, we must skip over the page header using xrecoff check. + */ + if (0 == RecPtr % XLogSegSize) { + XLByteAdvance(RecPtr, SizeOfXLogLongPHD); + } else if (0 == RecPtr % XLOG_BLCKSZ) { + XLByteAdvance(RecPtr, SizeOfXLogShortPHD); + } + } else { + /* + * Caller supplied a position to start at. + * + * In this case, the passed-in record pointer should already be + * pointing to a valid record starting position. + */ + Assert(XRecOffIsValid(RecPtr)); + randAccess = true; + } + + state->currRecPtr = RecPtr; + + targetPagePtr = RecPtr - RecPtr % XLOG_BLCKSZ; + targetRecOff = RecPtr % XLOG_BLCKSZ; + + /* + * Read the page containing the record into state->readBuf. Request + * enough byte to cover the whole record header, or at least the part of + * it that fits on the same page. + */ + readOff = ParallelReadPageInternal(state, targetPagePtr, Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); + if (readOff < 0) { + report_invalid_record(state, "read xlog page failed at %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + /* + * ReadPageInternal always returns at least the page header, so we can + * examine it now. + */ + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader)state->readBuf); + if (targetRecOff == 0) { + /* + * At page start, so skip over page header. + */ + RecPtr += pageHeaderSize; + targetRecOff = pageHeaderSize; + } else if (targetRecOff < pageHeaderSize) { + report_invalid_record(state, "invalid record offset at %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + if ((((XLogPageHeader)state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { + report_invalid_record(state, "contrecord is requested by %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + /* ReadPageInternal has verified the page header */ + Assert((int)pageHeaderSize <= readOff); + + /* + * Read the record length. + * + * NB: Even though we use an XLogRecord pointer here, the whole record + * header might not fit on this page. xl_tot_len is the first field of the + * struct, so it must be on this page (the records are MAXALIGNed), but we + * cannot access any other fields until we've verified that we got the + * whole header. 
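+ * Until the whole header is verified, xl_tot_len is the only field we rely
+ * on; it alone decides whether the record must be reassembled from several
+ * pages below.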
+ */ + record = (XLogRecord *)(state->readBuf + RecPtr % XLOG_BLCKSZ); + total_len = record->xl_tot_len; + + /* + * If the whole record header is on this page, validate it immediately. + * Otherwise do just a basic sanity check on xl_tot_len, and validate the + * rest of the header after reading it from the next page. The xl_tot_len + * check is necessary here to ensure that we enter the "Need to reassemble + * record" code path below; otherwise we might fail to apply + * ValidXLogRecordHeader at all. + */ + if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) { + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) + goto err; + gotheader = true; + } else { + /* more validation should be done here */ + if (total_len < SizeOfXLogRecord || total_len >= XLogRecordMaxSize) { + report_invalid_record(state, "invalid record length at %X/%X: wanted %u, got %u", (uint32)(RecPtr >> 32), + (uint32)RecPtr, (uint32)(SizeOfXLogRecord), + total_len); + goto err; + } + gotheader = false; + } + + /* + * Enlarge readRecordBuf as needed. + */ + if (total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)) { + /* We treat this as a "bogus data" condition */ + report_invalid_record(state, "record length %u at %X/%X too long", total_len, (uint32)(RecPtr >> 32), + (uint32)RecPtr); + goto err; + } + + len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; + if (total_len > len) { + /* Need to reassemble record */ + char *contdata = NULL; + XLogPageHeader pageHeader; + char *buffer = NULL; + uint32 gotlen; + errno_t errorno = EOK; + + readOff = ParallelReadPageInternal(state, targetPagePtr, XLOG_BLCKSZ); + if (readOff < 0) { + goto err; + } + + /* Copy the first fragment of the record from the first page. */ + errorno = memcpy_s(state->readRecordBuf, len, state->readBuf + RecPtr % XLOG_BLCKSZ, len); + securec_check_c(errorno, "\0", "\0"); + buffer = state->readRecordBuf + len; + gotlen = len; + + do { + /* Calculate pointer to beginning of next page */ + XLByteAdvance(targetPagePtr, XLOG_BLCKSZ); + + /* Wait for the next page to become available */ + readOff = ParallelReadPageInternal(state, targetPagePtr, + Min(total_len - gotlen + SizeOfXLogShortPHD, XLOG_BLCKSZ)); + if (readOff < 0) + goto err; + + Assert((int)SizeOfXLogShortPHD <= readOff); + + /* Check that the continuation on next page looks valid */ + pageHeader = (XLogPageHeader)state->readBuf; + if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { + report_invalid_record(state, "there is no contrecord flag at %X/%X", (uint32)(RecPtr >> 32), + (uint32)RecPtr); + goto err; + } + + /* + * Cross-check that xlp_rem_len agrees with how much of the record + * we expect there to be left. 
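+ * For example, if total_len is 10000 and gotlen = 6000 bytes have already
+ * been copied, the continuation page must carry xlp_rem_len == 4000.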
+ */ + if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { + report_invalid_record(state, "invalid contrecord length %u at %X/%X", pageHeader->xlp_rem_len, + (uint32)(RecPtr >> 32), (uint32)RecPtr); + goto err; + } + + /* Append the continuation from this page to the buffer */ + pageHeaderSize = XLogPageHeaderSize(pageHeader); + if (readOff < (int)pageHeaderSize) + readOff = ParallelReadPageInternal(state, targetPagePtr, pageHeaderSize); + + Assert((int)pageHeaderSize <= readOff); + + contdata = (char *)state->readBuf + pageHeaderSize; + len = XLOG_BLCKSZ - pageHeaderSize; + if (pageHeader->xlp_rem_len < len) + len = pageHeader->xlp_rem_len; + + if (readOff < (int)(pageHeaderSize + len)) + readOff = ParallelReadPageInternal(state, targetPagePtr, pageHeaderSize + len); + + errorno = memcpy_s(buffer, total_len - gotlen, (char *)contdata, len); + securec_check_c(errorno, "", ""); + buffer += len; + gotlen += len; + + /* If we just reassembled the record header, validate it. */ + if (!gotheader) { + record = (XLogRecord *)state->readRecordBuf; + if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) + goto err; + gotheader = true; + } + } while (gotlen < total_len); + + Assert(gotheader); + + record = (XLogRecord *)state->readRecordBuf; + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; + + pageHeaderSize = XLogPageHeaderSize((XLogPageHeader)state->readBuf); + state->ReadRecPtr = RecPtr; + state->EndRecPtr = targetPagePtr; + XLByteAdvance(state->EndRecPtr, (pageHeaderSize + MAXALIGN(pageHeader->xlp_rem_len))); + } else { + /* Wait for the record data to become available */ + readOff = ParallelReadPageInternal(state, targetPagePtr, Min(targetRecOff + total_len, XLOG_BLCKSZ)); + if (readOff < 0) { + goto err; + } + + /* Record does not cross a page boundary */ + if (!ValidXLogRecord(state, record, RecPtr)) + goto err; + + state->EndRecPtr = RecPtr; + XLByteAdvance(state->EndRecPtr, MAXALIGN(total_len)); + + state->ReadRecPtr = RecPtr; + errorno = memcpy_s(state->readRecordBuf, total_len, record, total_len); + securec_check_c(errorno, "\0", "\0"); + record = (XLogRecord *)state->readRecordBuf; + } + + /* + * Special processing if it's an XLOG SWITCH record + */ + if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) { + /* Pretend it extends to end of segment */ + state->EndRecPtr += XLogSegSize - 1; + state->EndRecPtr -= state->EndRecPtr % XLogSegSize; + } + + return record; +err: + + /* + * Invalidate the read state. We might read from a different source after + * failure. + */ + XLogReaderInvalReadState(state); + + if (state->errormsg_buf[0] != '\0') + *errormsg = state->errormsg_buf; + + return NULL; +} + +XLogRecord *XLogParallelReadNextRecord(XLogReaderState *xlogreader) +{ + XLogRecord *record = NULL; + + /* This is the first try to read this page. */ + t_thrd.xlog_cxt.failedSources = 0; + for (;;) { + char *errormsg = NULL; + + record = ParallelReadRecord(xlogreader, InvalidXLogRecPtr, &errormsg); + t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr; + t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr; + g_instance.comm_cxt.predo_cxt.redoPf.read_ptr = t_thrd.xlog_cxt.ReadRecPtr; + + if (record == NULL) { + /* + * We only end up here without a message when XLogPageRead() failed + * - in that case we already logged something. + * In StandbyMode that only happens if we have been triggered, so + * we shouldn't loop anymore in that case. 
+ */
+ if (errormsg != NULL)
+ ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr),
+ (errmsg_internal("%s", errormsg) /* already translated */));
+ }
+
+ /*
+ * Check page TLI is one of the expected values.
+ */
+ else if ((!timeLineInHistory(xlogreader->latestPageTLI, t_thrd.xlog_cxt.expectedTLIs)) &&
+ (!(g_instance.attr.attr_storage.IsRoachStandbyCluster && dummyStandbyMode))) {
+ char fname[MAXFNAMELEN];
+ XLogSegNo targetSegNo;
+ int32 offset;
+ errno_t errorno = EOK;
+
+ XLByteToSeg(xlogreader->latestPagePtr, targetSegNo);
+ offset = xlogreader->latestPagePtr % XLogSegSize;
+
+ errorno = snprintf_s(fname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", xlogreader->readPageTLI,
+ (uint32)((targetSegNo) / XLogSegmentsPerXLogId),
+ (uint32)((targetSegNo) % XLogSegmentsPerXLogId));
+ securec_check_ss(errorno, "", "");
+
+ ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr),
+ (errmsg("unexpected timeline ID %u in log segment %s, offset %u", xlogreader->latestPageTLI, fname,
+ offset)));
+ record = NULL;
+ }
+
+ if (record != NULL) {
+ /* Set up latest valid record */
+ latestValidRecord = t_thrd.xlog_cxt.ReadRecPtr;
+ latestRecordCrc = record->xl_crc;
+ latestRecordLen = record->xl_tot_len;
+ ADD_ABNORMAL_POSITION(9);
+ /* Great, got a record */
+ return record;
+ } else {
+ /* No valid record available from this source */
+ t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource;
+
+ if (t_thrd.xlog_cxt.readFile >= 0) {
+ close(t_thrd.xlog_cxt.readFile);
+ t_thrd.xlog_cxt.readFile = -1;
+ }
+
+ /*
+ * If archive recovery was requested, but we were still doing
+ * crash recovery, switch to archive recovery and retry using the
+ * offline archive. We have now replayed all the valid WAL in
+ * pg_xlog, so we are presumably now consistent.
+ *
+ * We require that there's at least some valid WAL present in
+ * pg_xlog, however (!fetch_ckpt). We could recover using the WAL
+ * from the archive, even if pg_xlog is completely empty, but we'd
+ * have no idea how far we'd have to replay to reach consistency.
+ * So err on the safe side and give up.
+ */
+ if (!t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.xlog_cxt.ArchiveRecoveryRequested) {
+ t_thrd.xlog_cxt.InArchiveRecovery = true;
+ if (t_thrd.xlog_cxt.StandbyModeRequested)
+ t_thrd.xlog_cxt.StandbyMode = true;
+ /* construct a minrecoverypoint, update LSN */
+ UpdateMinrecoveryInAchive();
+ /*
+ * Before we retry, reset lastSourceFailed and currentSource
+ * so that we will check the archive next.
+ */
+ t_thrd.xlog_cxt.failedSources = 0;
+ continue;
+ }
+
+ /* In standby mode, loop back to retry. Otherwise, give up.
*/ + if (t_thrd.xlog_cxt.StandbyMode && !t_thrd.xlog_cxt.recoveryTriggered && !DoEarlyExit()) + continue; + else + return NULL; + } + } +} + +} // namespace ondemand_extreme_rto \ No newline at end of file diff --git a/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp b/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp index 10dc1252fb65091c5d5f7e03d5345eaa7116b1bf..615e6cf16a6ef97f2bf03919dbfb997ef6431a25 100755 --- a/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp +++ b/src/gausskernel/storage/access/transam/parallel_recovery/dispatcher.cpp @@ -1974,7 +1974,7 @@ static void **CollectStatesFromWorkers(GetStateFunc getStateFunc) return NULL; } -void redo_get_wroker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) +void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum) { SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.rwlock)); knl_parallel_redo_state state = g_instance.comm_cxt.predo_cxt.state; @@ -2017,7 +2017,7 @@ void redo_get_wroker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, Assert(*realNum == cur_pos); } -void redo_get_wroker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) +void redo_get_worker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen) { PageRedoWorker *redoWorker = NULL; SpinLockAcquire(&(g_instance.comm_cxt.predo_cxt.destroy_lock)); diff --git a/src/gausskernel/storage/access/transam/redo_statistic.cpp b/src/gausskernel/storage/access/transam/redo_statistic.cpp index f46b36ed15b481752378b17e3e7d2db4aaa72748..7537010f522b347c5e2855e2f5d977ca1de1db87 100644 --- a/src/gausskernel/storage/access/transam/redo_statistic.cpp +++ b/src/gausskernel/storage/access/transam/redo_statistic.cpp @@ -196,7 +196,7 @@ void redo_get_worker_info_text(char *info, uint32 max_info_len) RedoWorkerStatsData worker[MAX_RECOVERY_THREAD_NUM] = {0}; uint32 worker_num = 0; errno_t errorno = EOK; - GetRedoWrokerStatistic(&worker_num, worker, MinNumber((uint32)MAX_RECOVERY_THREAD_NUM, max_info_len)); + GetRedoWorkerStatistic(&worker_num, worker, MinNumber((uint32)MAX_RECOVERY_THREAD_NUM, max_info_len)); if (worker_num == 0) { errorno = snprintf_s(info, max_info_len, max_info_len - 1, "%-16s", "no redo worker"); diff --git a/src/gausskernel/storage/access/transam/xlog.cpp b/src/gausskernel/storage/access/transam/xlog.cpp index 21501735dc344c736153a5886724a1672ff21b02..eb8978df3d158413822d5457ed0daf12daac153c 100755 --- a/src/gausskernel/storage/access/transam/xlog.cpp +++ b/src/gausskernel/storage/access/transam/xlog.cpp @@ -149,9 +149,6 @@ #include "access/redo_statistic.h" #include "access/multi_redo_api.h" #include "access/parallel_recovery/dispatcher.h" -#include "access/extreme_rto/dispatcher.h" -#include "access/extreme_rto/spsc_blocking_queue.h" -#include "access/extreme_rto/page_redo.h" #include "vectorsonic/vsonichash.h" #include "ddes/dms/ss_reform_common.h" @@ -363,10 +360,8 @@ static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible); static bool XLogWritePaxos(XLogRecPtr WritePaxosRqst); #endif static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance, - bool use_lock); + bool use_lock, const char *xlog_dir); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk); -static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources); -static int emode_for_corrupt_record(int emode, XLogRecPtr 
RecPtr);
static void XLogFileClose(void);
static void KeepFileRestoredFromArchive(const char *path, const char *xlogfname);
static bool RestoreArchivedFile(char *path, const char *xlogfname, const char *recovername, off_t expectedSize);
@@ -381,15 +376,12 @@ static void CleanupBackupHistory(void);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, bool fetching_ckpt);
void CheckRecoveryConsistency(void);
static bool existsTimeLineHistory(TimeLineID probeTLI);
-static bool rescanLatestTimeLine(void);
static TimeLineID findNewestTimeLine(TimeLineID startTLI);
-static bool timeLineInHistory(TimeLineID tli, List *expectedTLEs);
STATIC void WriteControlFile(void);
STATIC void ReadControlFile(void);
static void RecoverControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForPrimaryTrigger(void);
-static bool CheckForStandbyTrigger(void);
#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
@@ -422,21 +414,15 @@ static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static XLogRecPtr XLogInsertRecordSingle(XLogRecData *rdata, XLogRecPtr fpw_lsn);
-static bool DoEarlyExit();
-
-static int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
- XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path);
static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int expectReadLen,
XLogRecPtr targetRecPtr, char *buf, TimeLineID *readTLI, char* xlog_path);
-
void ArchiveXlogForForceFinishRedo(XLogReaderState *xlogreader, TermFileData *term_file);
TermFileData GetTermFileDataAndClear(void);
XLogRecPtr mpfl_read_max_flush_lsn();
void mpfl_new_file();
void mpfl_ulink_file();
bool mpfl_pread_file(int fd, void *buf, int32 size, int64 offset);
-int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr,
- TimeLineID *readTLI);
+static void SSOndemandXlogCopy(XLogSegNo copySegNo, uint32 startOffset, char *copyBuffer, Size copyBytes);
#ifdef __aarch64__
static XLogRecPtr XLogInsertRecordGroup(XLogRecData *rdata, XLogRecPtr fpw_lsn);
@@ -2768,6 +2754,9 @@ static void XLogWrite(const XLogwrtRqst &WriteRqst, bool flexible)
t_thrd.xlog_cxt.openLogOff += nbytes;
npages = 0;
+ /* write a copy to the recovery directory */
+ SSOndemandXlogCopy(t_thrd.xlog_cxt.openLogSegNo, startoffset, from, nbytes);
+
/*
* If we just wrote the whole last page of a logfile segment,
* fsync the segment immediately. This avoids having to go back
@@ -3720,7 +3709,7 @@ bool XLogNeedsFlush(XLogRecPtr record)
* take down the system on failure). They will promote to PANIC if we are
* in a critical section.
*/ -int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +static int XLogFileInitInternal(XLogSegNo logsegno, bool *use_existent, bool use_lock, const char *xlog_dir) { char path[MAXPGPATH]; char tmppath[MAXPGPATH]; @@ -3733,7 +3722,7 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) errno_t rc = EOK; gstrace_entry(GS_TRC_ID_XLogFileInit); - rc = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, t_thrd.xlog_cxt.ThisTimeLineID, + rc = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((logsegno) / XLogSegmentsPerXLogId), (uint32)((logsegno) % XLogSegmentsPerXLogId)); securec_check_ss(rc, "", ""); @@ -3765,7 +3754,7 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ ereport(DEBUG2, (errmsg("creating and filling new WAL file"))); - rc = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", SS_XLOGDIR, gs_thread_self()); + rc = snprintf_s(tmppath, MAXPGPATH, MAXPGPATH - 1, "%s/xlogtemp.%lu", xlog_dir, gs_thread_self()); securec_check_ss(rc, "\0", "\0"); unlink(tmppath); @@ -3847,7 +3836,8 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) */ installed_segno = logsegno; max_advance = XLOGfileslop; - if (!InstallXLogFileSegment(&installed_segno, (const char *)tmppath, *use_existent, &max_advance, use_lock)) { + if (!InstallXLogFileSegment(&installed_segno, (const char *)tmppath, *use_existent, &max_advance, + use_lock, xlog_dir)) { /* * No need for any more future segments, or InstallXLogFileSegment() * failed to rename the file into place. If the rename failed, opening @@ -3873,6 +3863,11 @@ int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) return fd; } +int XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) +{ + return XLogFileInitInternal(logsegno, use_existent, use_lock, SS_XLOGDIR); +} + void XLogFileCutPage(char *buffer, uint32 bufLen, uint32 cpyLen) { if (bufLen != 0) { @@ -4133,13 +4128,13 @@ static void XLogFileTruncate(char *path, XLogRecPtr RecPtr) * file into place. */ static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool find_free, int *max_advance, - bool use_lock) + bool use_lock, const char *xlog_dir) { char path[MAXPGPATH]; struct stat stat_buf; errno_t errorno = EOK; - errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, t_thrd.xlog_cxt.ThisTimeLineID, + errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((*segno) / XLogSegmentsPerXLogId), (uint32)((*segno) % XLogSegmentsPerXLogId)); securec_check_ss(errorno, "", ""); @@ -4165,7 +4160,7 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool f } (*segno)++; (*max_advance)--; - errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, + errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((*segno) / XLogSegmentsPerXLogId), (uint32)((*segno) % XLogSegmentsPerXLogId)); securec_check_ss(errorno, "", ""); @@ -4191,13 +4186,13 @@ static bool InstallXLogFileSegment(XLogSegNo *segno, const char *tmppath, bool f /* * Open a pre-existing logfile segment for writing. 
*/ -int XLogFileOpen(XLogSegNo segno) +static int XLogFileOpenInternal(XLogSegNo segno, const char *xlog_dir) { char path[MAXPGPATH]; int fd; errno_t errorno = EOK; - errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", SS_XLOGDIR, t_thrd.xlog_cxt.ThisTimeLineID, + errorno = snprintf_s(path, MAXPGPATH, MAXPGPATH - 1, "%s/%08X%08X%08X", xlog_dir, t_thrd.xlog_cxt.ThisTimeLineID, (uint32)((segno) / XLogSegmentsPerXLogId), (uint32)((segno) % XLogSegmentsPerXLogId)); securec_check_ss(errorno, "", ""); @@ -4228,6 +4223,11 @@ void SSXLOGCopyFromOldPrimary(XLogReaderState *state, XLogRecPtr pageptr) } } +int XLogFileOpen(XLogSegNo segno) +{ + return XLogFileOpenInternal(segno, SS_XLOGDIR); +} + /* * Open a logfile segment for reading (during recovery). * @@ -4316,7 +4316,7 @@ static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, * Open a logfile segment for reading (during recovery). * This version searches for the segment with any TLI listed in expectedTLIs. */ -static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources) +int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources) { char path[MAXPGPATH]; ListCell *cell = NULL; @@ -5051,7 +5051,7 @@ static void RemoveXlogFile(const char *segname, XLogRecPtr endptr) * symbolic links pointing to a separate archive directory. */ if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && - InstallXLogFileSegment(&endLogSegNo, (const char *)path, true, &max_advance, true)) { + InstallXLogFileSegment(&endLogSegNo, (const char *)path, true, &max_advance, true, SS_XLOGDIR)) { ereport(DEBUG2, (errmsg("recycled transaction log file \"%s\"", segname))); t_thrd.xlog_cxt.CheckpointStats->ckpt_segs_recycled++; /* Needn't recheck that slot on future iterations */ @@ -5173,61 +5173,6 @@ static void CleanupBackupHistory(void) FreeDir(xldir); } -inline static XLogReaderState *ReadNextRecordFromQueue(int emode) -{ - char *errormsg = NULL; - extreme_rto::SPSCBlockingQueue *linequeue = extreme_rto::g_dispatcher->readLine.readPageThd->queue; - XLogReaderState *xlogreader = NULL; - do { - xlogreader = (XLogReaderState *)extreme_rto::SPSCBlockingQueueTake(linequeue); - if (!xlogreader->isDecode) { - XLogRecord *record = (XLogRecord *)xlogreader->readRecordBuf; - GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_5]); - if (!DecodeXLogRecord(xlogreader, record, &errormsg)) { - ereport(emode, - (errmsg("ReadNextRecordFromQueue %X/%X decode error, %s", (uint32)(xlogreader->EndRecPtr >> 32), - (uint32)(xlogreader->EndRecPtr), errormsg))); - - extreme_rto::RedoItem *item = extreme_rto::GetRedoItemPtr(xlogreader); - - extreme_rto::FreeRedoItem(item); - - xlogreader = NULL; - } - CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_5]); - } - - if ((void *)xlogreader == (void *)&(extreme_rto::g_GlobalLsnForwarder.record) || - (void *)xlogreader == (void *)&(extreme_rto::g_cleanupMark.record)) { - extreme_rto::StartupSendFowarder(extreme_rto::GetRedoItemPtr(xlogreader)); - xlogreader = NULL; - } - - RedoInterruptCallBack(); - } while (xlogreader == NULL); - - return xlogreader; -} - -static XLogRecord *ReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode) -{ - XLogRecord *record = NULL; - XLogReaderState *xlogreader = ReadNextRecordFromQueue(emode); - - if ((void *)xlogreader != (void *)&(extreme_rto::g_redoEndMark.record)) { - *xlogreaderptr = xlogreader; - t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr; - t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr; - record = (XLogRecord 
*)xlogreader->readRecordBuf; - } else { - *xlogreaderptr = &extreme_rto::g_redoEndMark.record; - if (t_thrd.startup_cxt.shutdown_requested) { - proc_exit(0); - } - } - return record; -} - /* * Attempt to read an XLOG record. * @@ -5401,339 +5346,7 @@ static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, in } } -int ParallelReadPageInternal(XLogReaderState *state, XLogRecPtr pageptr, int reqLen) -{ - int readLen; - uint32 targetPageOff; - XLogSegNo targetSegNo; - XLogPageHeader hdr; - - Assert((pageptr % XLOG_BLCKSZ) == 0); - - XLByteToSeg(pageptr, targetSegNo); - targetPageOff = (pageptr % XLogSegSize); - - /* check whether we have all the requested data already */ - if (targetSegNo == state->readSegNo && targetPageOff == state->readOff && reqLen < (int)state->readLen) { - return state->readLen; - } - - /* - * First, read the requested data length, but at least a short page header - * so that we can validate it. - */ - readLen = ParallelXLogPageRead(state, pageptr, Max(reqLen, (int)SizeOfXLogShortPHD), state->currRecPtr, - &state->readPageTLI); - if (readLen < 0) { - goto err; - } - - Assert(readLen <= XLOG_BLCKSZ); - - /* Do we have enough data to check the header length? */ - if (readLen <= (int)SizeOfXLogShortPHD) { - goto err; - } - - Assert(readLen >= reqLen); - - hdr = (XLogPageHeader)state->readBuf; - - /* still not enough */ - if (readLen < (int)XLogPageHeaderSize(hdr)) { - readLen = ParallelXLogPageRead(state, pageptr, XLogPageHeaderSize(hdr), state->currRecPtr, &state->readPageTLI); - if (readLen < 0) { - goto err; - } - } - - /* - * Now that we know we have the full header, validate it. - */ - if (!ValidXLogPageHeader(state, pageptr, hdr)) { - goto err; - } - - /* update read state information */ - state->readSegNo = targetSegNo; - state->readOff = targetPageOff; - state->readLen = readLen; - - return readLen; - -err: - XLogReaderInvalReadState(state); - return -1; -} - -XLogRecord *ParallelReadRecord(XLogReaderState *state, XLogRecPtr RecPtr, char **errormsg) -{ - XLogRecord *record = NULL; - XLogRecPtr targetPagePtr; - bool randAccess = false; - uint32 len, total_len; - uint32 targetRecOff; - uint32 pageHeaderSize; - bool gotheader = false; - int readOff; - errno_t errorno = EOK; - - /* - * randAccess indicates whether to verify the previous-record pointer of - * the record we're reading. We only do this if we're reading - * sequentially, which is what we initially assume. - */ - randAccess = false; - - /* reset error state */ - *errormsg = NULL; - state->errormsg_buf[0] = '\0'; - - if (XLByteEQ(RecPtr, InvalidXLogRecPtr)) { - /* No explicit start point; read the record after the one we just read */ - RecPtr = state->EndRecPtr; - - if (XLByteEQ(state->ReadRecPtr, InvalidXLogRecPtr)) - randAccess = true; - - /* - * If at page start, we must skip over the page header using xrecoff check. - */ - if (0 == RecPtr % XLogSegSize) { - XLByteAdvance(RecPtr, SizeOfXLogLongPHD); - } else if (0 == RecPtr % XLOG_BLCKSZ) { - XLByteAdvance(RecPtr, SizeOfXLogShortPHD); - } - } else { - /* - * Caller supplied a position to start at. - * - * In this case, the passed-in record pointer should already be - * pointing to a valid record starting position. - */ - Assert(XRecOffIsValid(RecPtr)); - randAccess = true; - } - - state->currRecPtr = RecPtr; - - targetPagePtr = RecPtr - RecPtr % XLOG_BLCKSZ; - targetRecOff = RecPtr % XLOG_BLCKSZ; - - /* - * Read the page containing the record into state->readBuf. 
Request - * enough byte to cover the whole record header, or at least the part of - * it that fits on the same page. - */ - readOff = ParallelReadPageInternal(state, targetPagePtr, Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ)); - if (readOff < 0) { - report_invalid_record(state, "read xlog page failed at %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); - goto err; - } - - /* - * ReadPageInternal always returns at least the page header, so we can - * examine it now. - */ - pageHeaderSize = XLogPageHeaderSize((XLogPageHeader)state->readBuf); - if (targetRecOff == 0) { - /* - * At page start, so skip over page header. - */ - RecPtr += pageHeaderSize; - targetRecOff = pageHeaderSize; - } else if (targetRecOff < pageHeaderSize) { - report_invalid_record(state, "invalid record offset at %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); - goto err; - } - - if ((((XLogPageHeader)state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize) { - report_invalid_record(state, "contrecord is requested by %X/%X", (uint32)(RecPtr >> 32), (uint32)RecPtr); - goto err; - } - - /* ReadPageInternal has verified the page header */ - Assert((int)pageHeaderSize <= readOff); - - /* - * Read the record length. - * - * NB: Even though we use an XLogRecord pointer here, the whole record - * header might not fit on this page. xl_tot_len is the first field of the - * struct, so it must be on this page (the records are MAXALIGNed), but we - * cannot access any other fields until we've verified that we got the - * whole header. - */ - record = (XLogRecord *)(state->readBuf + RecPtr % XLOG_BLCKSZ); - total_len = record->xl_tot_len; - - /* - * If the whole record header is on this page, validate it immediately. - * Otherwise do just a basic sanity check on xl_tot_len, and validate the - * rest of the header after reading it from the next page. The xl_tot_len - * check is necessary here to ensure that we enter the "Need to reassemble - * record" code path below; otherwise we might fail to apply - * ValidXLogRecordHeader at all. - */ - if (targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord) { - if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) - goto err; - gotheader = true; - } else { - /* more validation should be done here */ - if (total_len < SizeOfXLogRecord || total_len >= XLogRecordMaxSize) { - report_invalid_record(state, "invalid record length at %X/%X: wanted %u, got %u", (uint32)(RecPtr >> 32), - (uint32)RecPtr, (uint32)(SizeOfXLogRecord), - total_len); - goto err; - } - gotheader = false; - } - - /* - * Enlarge readRecordBuf as needed. - */ - if (total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)) { - /* We treat this as a "bogus data" condition */ - report_invalid_record(state, "record length %u at %X/%X too long", total_len, (uint32)(RecPtr >> 32), - (uint32)RecPtr); - goto err; - } - - len = XLOG_BLCKSZ - RecPtr % XLOG_BLCKSZ; - if (total_len > len) { - /* Need to reassemble record */ - char *contdata = NULL; - XLogPageHeader pageHeader; - char *buffer = NULL; - uint32 gotlen; - errno_t errorno = EOK; - - readOff = ParallelReadPageInternal(state, targetPagePtr, XLOG_BLCKSZ); - if (readOff < 0) { - goto err; - } - - /* Copy the first fragment of the record from the first page. 
*/ - errorno = memcpy_s(state->readRecordBuf, len, state->readBuf + RecPtr % XLOG_BLCKSZ, len); - securec_check_c(errorno, "\0", "\0"); - buffer = state->readRecordBuf + len; - gotlen = len; - - do { - /* Calculate pointer to beginning of next page */ - XLByteAdvance(targetPagePtr, XLOG_BLCKSZ); - - /* Wait for the next page to become available */ - readOff = ParallelReadPageInternal(state, targetPagePtr, - Min(total_len - gotlen + SizeOfXLogShortPHD, XLOG_BLCKSZ)); - if (readOff < 0) - goto err; - - Assert((int)SizeOfXLogShortPHD <= readOff); - - /* Check that the continuation on next page looks valid */ - pageHeader = (XLogPageHeader)state->readBuf; - if (!(pageHeader->xlp_info & XLP_FIRST_IS_CONTRECORD)) { - report_invalid_record(state, "there is no contrecord flag at %X/%X", (uint32)(RecPtr >> 32), - (uint32)RecPtr); - goto err; - } - - /* - * Cross-check that xlp_rem_len agrees with how much of the record - * we expect there to be left. - */ - if (pageHeader->xlp_rem_len == 0 || total_len != (pageHeader->xlp_rem_len + gotlen)) { - report_invalid_record(state, "invalid contrecord length %u at %X/%X", pageHeader->xlp_rem_len, - (uint32)(RecPtr >> 32), (uint32)RecPtr); - goto err; - } - - /* Append the continuation from this page to the buffer */ - pageHeaderSize = XLogPageHeaderSize(pageHeader); - if (readOff < (int)pageHeaderSize) - readOff = ParallelReadPageInternal(state, targetPagePtr, pageHeaderSize); - - Assert((int)pageHeaderSize <= readOff); - - contdata = (char *)state->readBuf + pageHeaderSize; - len = XLOG_BLCKSZ - pageHeaderSize; - if (pageHeader->xlp_rem_len < len) - len = pageHeader->xlp_rem_len; - - if (readOff < (int)(pageHeaderSize + len)) - readOff = ParallelReadPageInternal(state, targetPagePtr, pageHeaderSize + len); - - errorno = memcpy_s(buffer, total_len - gotlen, (char *)contdata, len); - securec_check_c(errorno, "", ""); - buffer += len; - gotlen += len; - - /* If we just reassembled the record header, validate it. */ - if (!gotheader) { - record = (XLogRecord *)state->readRecordBuf; - if (!ValidXLogRecordHeader(state, RecPtr, state->ReadRecPtr, record, randAccess)) - goto err; - gotheader = true; - } - } while (gotlen < total_len); - - Assert(gotheader); - - record = (XLogRecord *)state->readRecordBuf; - if (!ValidXLogRecord(state, record, RecPtr)) - goto err; - - pageHeaderSize = XLogPageHeaderSize((XLogPageHeader)state->readBuf); - state->ReadRecPtr = RecPtr; - state->EndRecPtr = targetPagePtr; - XLByteAdvance(state->EndRecPtr, (pageHeaderSize + MAXALIGN(pageHeader->xlp_rem_len))); - } else { - /* Wait for the record data to become available */ - readOff = ParallelReadPageInternal(state, targetPagePtr, Min(targetRecOff + total_len, XLOG_BLCKSZ)); - if (readOff < 0) { - goto err; - } - - /* Record does not cross a page boundary */ - if (!ValidXLogRecord(state, record, RecPtr)) - goto err; - - state->EndRecPtr = RecPtr; - XLByteAdvance(state->EndRecPtr, MAXALIGN(total_len)); - - state->ReadRecPtr = RecPtr; - errorno = memcpy_s(state->readRecordBuf, total_len, record, total_len); - securec_check_c(errorno, "\0", "\0"); - record = (XLogRecord *)state->readRecordBuf; - } - - /* - * Special processing if it's an XLOG SWITCH record - */ - if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH) { - /* Pretend it extends to end of segment */ - state->EndRecPtr += XLogSegSize - 1; - state->EndRecPtr -= state->EndRecPtr % XLogSegSize; - } - - return record; -err: - - /* - * Invalidate the read state. We might read from a different source after - * failure. 
- */ - XLogReaderInvalReadState(state); - - if (state->errormsg_buf[0] != '\0') - *errormsg = state->errormsg_buf; - - return NULL; -} - -static void UpdateMinrecoveryInAchive() +void UpdateMinrecoveryInAchive() { volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; XLogRecPtr newMinRecoveryPoint; @@ -5761,108 +5374,6 @@ static void UpdateMinrecoveryInAchive() (uint32)(t_thrd.xlog_cxt.minRecoveryPoint >> 32), (uint32)(t_thrd.xlog_cxt.minRecoveryPoint)))); } -XLogRecord *XLogParallelReadNextRecord(XLogReaderState *xlogreader) -{ - XLogRecord *record = NULL; - - /* This is the first try to read this page. */ - t_thrd.xlog_cxt.failedSources = 0; - for (;;) { - char *errormsg = NULL; - - record = ParallelReadRecord(xlogreader, InvalidXLogRecPtr, &errormsg); - t_thrd.xlog_cxt.ReadRecPtr = xlogreader->ReadRecPtr; - t_thrd.xlog_cxt.EndRecPtr = xlogreader->EndRecPtr; - g_instance.comm_cxt.predo_cxt.redoPf.read_ptr = t_thrd.xlog_cxt.ReadRecPtr; - - if (record == NULL) { - /* - * We only end up here without a message when XLogPageRead() failed - * - in that case we already logged something. - * In StandbyMode that only happens if we have been triggered, so - * we shouldn't loop anymore in that case. - */ - if (errormsg != NULL) - ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr), - (errmsg_internal("%s", errormsg) /* already translated */)); - } - - /* - * Check page TLI is one of the expected values. - */ - else if ((!timeLineInHistory(xlogreader->latestPageTLI, t_thrd.xlog_cxt.expectedTLIs)) && - (!(g_instance.attr.attr_storage.IsRoachStandbyCluster && dummyStandbyMode))) { - char fname[MAXFNAMELEN]; - XLogSegNo targetSegNo; - int32 offset; - errno_t errorno = EOK; - - XLByteToSeg(xlogreader->latestPagePtr, targetSegNo); - offset = xlogreader->latestPagePtr % XLogSegSize; - - errorno = snprintf_s(fname, MAXFNAMELEN, MAXFNAMELEN - 1, "%08X%08X%08X", xlogreader->readPageTLI, - (uint32)((targetSegNo) / XLogSegmentsPerXLogId), - (uint32)((targetSegNo) % XLogSegmentsPerXLogId)); - securec_check_ss(errorno, "", ""); - - ereport(emode_for_corrupt_record(LOG, t_thrd.xlog_cxt.EndRecPtr), - (errmsg("unexpected timeline ID %u in log segment %s, offset %u", xlogreader->latestPageTLI, fname, - offset))); - record = NULL; - } - - if (record != NULL) { - /* Set up lastest valid record */ - latestValidRecord = t_thrd.xlog_cxt.ReadRecPtr; - latestRecordCrc = record->xl_crc; - latestRecordLen = record->xl_tot_len; - ADD_ABNORMAL_POSITION(9); - /* Great, got a record */ - return record; - } else { - /* No valid record available from this source */ - t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource; - - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - t_thrd.xlog_cxt.readFile = -1; - } - - /* - * If archive recovery was requested, but we were still doing - * crash recovery, switch to archive recovery and retry using the - * offline archive. We have now replayed all the valid WAL in - * pg_xlog, so we are presumably now consistent. - * - * We require that there's at least some valid WAL present in - * pg_xlog, however (!fetch_ckpt). We could recover using the WAL - * from the archive, even if pg_xlog is completely empty, but we'd - * have no idea how far we'd have to replay to reach consistency. - * So err on the safe side and give up. 
- */ - if (!t_thrd.xlog_cxt.InArchiveRecovery && t_thrd.xlog_cxt.ArchiveRecoveryRequested) { - t_thrd.xlog_cxt.InArchiveRecovery = true; - if (t_thrd.xlog_cxt.StandbyModeRequested) - t_thrd.xlog_cxt.StandbyMode = true; - /* construct a minrecoverypoint, update LSN */ - UpdateMinrecoveryInAchive(); - /* - * Before we retry, reset lastSourceFailed and currentSource - * so that we will check the archive next. - */ - t_thrd.xlog_cxt.failedSources = 0; - continue; - } - - /* In standby mode, loop back to retry. Otherwise, give up. */ - if (t_thrd.xlog_cxt.StandbyMode && !t_thrd.xlog_cxt.recoveryTriggered && !DoEarlyExit()) - continue; - else - return NULL; - } - } -} - /* * Try to read a timeline's history file. * @@ -6013,7 +5524,7 @@ static bool existsTimeLineHistory(TimeLineID probeTLI) * If there are any, the function changes recovery target TLI to the latest * one and returns 'true'. */ -static bool rescanLatestTimeLine(void) +bool rescanLatestTimeLine(void) { TimeLineID newtarget; @@ -6864,6 +6375,8 @@ void XLOGShmemInit(void) t_thrd.shemem_ptr_cxt.XLogCtl->XLogCacheBlck = g_instance.attr.attr_storage.XLOGbuffers - 1; t_thrd.shemem_ptr_cxt.XLogCtl->SharedRecoveryInProgress = true; t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false; + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone = false; t_thrd.shemem_ptr_cxt.XLogCtl->SharedHotStandbyActive = false; t_thrd.shemem_ptr_cxt.XLogCtl->WalWriterSleeping = false; t_thrd.shemem_ptr_cxt.XLogCtl->xlogFlushPtrForPerRead = InvalidXLogRecPtr; @@ -7184,7 +6697,7 @@ void BootStrapXLOG(void) /* In SS, the first node to create control file is will be primary */ if (ENABLE_DSS) { - SSWriteReformerControlPages(); + SSInitReformerControlPages(); } /* Now create pg_control */ @@ -7541,7 +7054,7 @@ static void exitArchiveRecovery(TimeLineID endTLI, XLogSegNo endLogSegNo) /* * Now move the segment into place with its final name. 
*/ - if (!InstallXLogFileSegment(&endLogSegNo, (const char *)tmppath, false, NULL, false)) + if (!InstallXLogFileSegment(&endLogSegNo, (const char *)tmppath, false, NULL, false, SS_XLOGDIR)) ereport(ERROR, (errcode(ERRCODE_CASE_NOT_FOUND), errmsg("InstallXLogFileSegment should not have failed"))); if (XLogArchivingActive()) { errorno = snprintf_s(xlogpath, MAXPGPATH, MAXPGPATH - 1, "%08X%08X%08X", endTLI, @@ -8584,7 +8097,7 @@ void ResourceManagerStop(void) errorno = memcpy_s((_oldXlogReader)->readBuf, XLOG_BLCKSZ, (_xlogreader)->readBuf, \ (_oldXlogReader)->readLen); \ securec_check(errorno, "", ""); \ - if (ENABLE_DSS && ENABLE_DMS) { \ + if (ENABLE_DSS && ENABLE_DMS && (_xlogreader)->preReadBuf != NULL) { \ (_oldXlogReader)->preReadStartPtr = (_xlogreader)->preReadStartPtr; \ errorno = memcpy_s((_oldXlogReader)->preReadBuf, XLogPreReadSize, \ (_xlogreader)->preReadBuf, XLogPreReadSize); \ @@ -8600,7 +8113,7 @@ void ResourceManagerStop(void) static void EndRedoXlog() { if (IsExtremeRtoRunning()) { - extreme_rto::CheckCommittingCsnList(); + ExtremeCheckCommittingCsnList(); } if ((get_real_recovery_parallelism() > 1) && (!parallel_recovery::DispatchPtrIsNull())) { @@ -8635,6 +8148,10 @@ inline void PrintCkpXctlControlFile(XLogRecPtr oldCkpLoc, CheckPoint *oldCkp, XL void CheckForRestartPoint() { + if (SS_IN_ONDEMAND_RECOVERY) { + return; + } + XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; if (XLByteLT(xlogctl->lastCheckPointRecPtr, g_instance.comm_cxt.predo_cxt.newestCheckpointLoc)) { @@ -8983,6 +8500,10 @@ static void XLogMakeUpRemainSegsContent(char *contentBuffer) void XLogCheckRemainSegs() { + if (SS_ONDEMAND_BUILD_DONE && !SS_ONDEMAND_RECOVERY_DONE) { + return; + } + uint32 contentLen = XLogGetRemainContentLen(); pg_crc32c crc; char* contentBuffer = (char *)palloc_huge(CurrentMemoryContext, (contentLen + sizeof(pg_crc32c))); @@ -9124,8 +8645,8 @@ void handleRecoverySusPend(XLogRecPtr lsn) { if (RecoveryIsSuspend()) { if (IsExtremeRedo()) { - extreme_rto::DispatchClosefdMarkToAllRedoWorker(); - extreme_rto::WaitAllReplayWorkerIdle(); + ExtremeDispatchClosefdMarkToAllRedoWorker(); + ExtremeWaitAllReplayWorkerIdle(); } else if (IsParallelRedo()) { if (AmStartupProcess()) { ProcTxnWorkLoad(true); @@ -9280,18 +8801,33 @@ void StartupXLOG(void) * Note: in most control paths, *ControlFile is already valid and we need * not do ReadControlFile() here, but might as well do it to be sure. 
 */
-    if (ENABLE_DMS) {
-        int src_id = g_instance.attr.attr_storage.dms_attr.instance_id;
-        if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
-            src_id = SSGetPrimaryInstId();
-            ereport(LOG, (errmsg("[SS Reform]: Standby:%d promoting, reading control file of original primary:%d",
-                g_instance.attr.attr_storage.dms_attr.instance_id, src_id)));
-        }
-        SSReadControlFile(src_id);
-    } else {
-        ReadControlFile();
-    }
-    if (FORCE_FINISH_ENABLED) {
+    if (ENABLE_DMS && ENABLE_DSS) {
+        int src_id = INVALID_INSTANCEID;
+        if (SS_CLUSTER_ONDEMAND_RECOVERY && SS_PRIMARY_MODE) {
+            if (SS_STANDBY_PROMOTING) {
+                ereport(FATAL, (errmsg("Do not allow switchover because on-demand recovery has not finished")));
+            }
+
+            Assert(g_instance.dms_cxt.SSReformerControl.recoveryInstId != INVALID_INSTANCEID);
+            src_id = g_instance.dms_cxt.SSReformerControl.recoveryInstId;
+            ereport(LOG, (errmsg("[on-demand]: On-demand recovery did not finish in the last reform, "
+                "reading control file of original primary:%d", src_id)));
+        } else {
+            if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
+                src_id = SSGetPrimaryInstId();
+                ereport(LOG, (errmsg("[SS Reform]: Standby:%d promoting, reading control file of original primary:%d",
+                    g_instance.attr.attr_storage.dms_attr.instance_id, src_id)));
+            } else {
+                src_id = g_instance.attr.attr_storage.dms_attr.instance_id;
+            }
+            g_instance.dms_cxt.SSReformerControl.recoveryInstId = src_id;
+            SSSaveReformerCtrl();
+        }
+        SSReadControlFile(src_id);
+    } else {
+        ReadControlFile();
+    }
+    if (FORCE_FINISH_ENABLED) {
         max_page_flush_lsn = mpfl_read_max_flush_lsn();
         /* we can't exit proc here, because init gaussdb will run through here and there must be no LsnInfoFile. */
         ereport(LOG,
@@ -9484,16 +9020,14 @@ void StartupXLOG(void)
     securec_check(errorno, "", "");
 
     if (ENABLE_DMS && ENABLE_DSS) {
+        SSGetRecoveryXlogPath();
+        xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER);
+        close_readFile_if_open();
         if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) {
-            SSGetXlogPath();
-            xlogreader = SSXLogReaderAllocate(&SSXLogPageRead, &readprivate, ALIGNOF_BUFFER);
-            close_readFile_if_open();
             // init shared memory set page empty
             SSCSNLOGShmemClear();
             SSCLOGShmemClear();
             SSMultiXactShmemClear();
-        } else {
-            xlogreader = SSXLogReaderAllocate(&XLogPageRead, &readprivate, ALIGNOF_BUFFER);
         }
     } else {
         xlogreader = XLogReaderAllocate(&XLogPageRead, &readprivate);
@@ -9765,6 +9299,8 @@ void StartupXLOG(void)
         SetMultiXactIdLimit(FirstMultiXactId, TemplateDbOid);
         t_thrd.shemem_ptr_cxt.XLogCtl->ckptXid = checkPoint.oldestXid;
         t_thrd.shemem_ptr_cxt.XLogCtl->IsRecoveryDone = false;
+        t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone = false;
+        t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone = false;
         latestCompletedXid = checkPoint.nextXid;
         TransactionIdRetreat(latestCompletedXid);
@@ -9942,6 +9478,17 @@ void StartupXLOG(void)
         t_thrd.xlog_cxt.InRecovery = false;
     }
 
+    if (SS_PRIMARY_MODE) {
+        if (ENABLE_ONDEMAND_RECOVERY && t_thrd.xlog_cxt.InRecovery == true) {
+            g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = true;
+            /* for other nodes in cluster */
+            g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_IN_ONDEMAND_BUILD;
+        } else {
+            g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
+        }
+        SSSaveReformerCtrl();
+    }
+
     ReadRemainSegsFile();
     /* Determine whether it is currently in the switchover of streaming disaster recovery */
     checkHadrInSwitchover();
@@ -10245,7 +9792,7 @@ void StartupXLOG(void)
 
     if (IsExtremeRedo()) {
         xlogreader->isPRProcess = true;
-        record = 
ReadNextXLogRecord(&xlogreader, LOG); + record = ExtremeReadNextXLogRecord(&xlogreader, LOG); if (record == NULL) { ereport(PANIC, (errmsg("redo starts at %X/%X", (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)t_thrd.xlog_cxt.ReadRecPtr))); @@ -10313,13 +9860,13 @@ void StartupXLOG(void) if (recoveryStopsHere(xlogreader, &recoveryApply)) { reachedStopPoint = true; /* see below */ recoveryContinue = false; - extreme_rto::ExtremeRtoStopHere(); + ExtremeExtremeRtoStopHere(); /* Exit loop if we reached non-inclusive recovery target */ if (!recoveryApply && (t_thrd.xlog_cxt.server_mode == PRIMARY_MODE || t_thrd.xlog_cxt.server_mode == NORMAL_MODE || (IS_OBS_DISASTER_RECOVER_MODE && (t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)))) { - extreme_rto::WaitAllRedoWorkerQueueEmpty(); + ExtremeWaitAllRedoWorkerQueueEmpty(); break; } } @@ -10401,7 +9948,7 @@ void StartupXLOG(void) (t_thrd.xlog_cxt.server_mode == PRIMARY_MODE || t_thrd.xlog_cxt.server_mode == NORMAL_MODE || (IS_OBS_DISASTER_RECOVER_MODE && (t_thrd.xlog_cxt.recoveryTarget != RECOVERY_TARGET_TIME_OBS)) || IS_DISASTER_RECOVER_MODE)) { - extreme_rto::WaitAllRedoWorkerQueueEmpty(); + ExtremeWaitAllRedoWorkerQueueEmpty(); break; } @@ -10428,14 +9975,19 @@ void StartupXLOG(void) GetRedoStartTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]); if (xlogreader->isPRProcess && IsExtremeRedo()) { - record = ReadNextXLogRecord(&xlogreader, LOG); + record = ExtremeReadNextXLogRecord(&xlogreader, LOG); } else { xlogreader = newXlogReader; record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false); } CountRedoTime(t_thrd.xlog_cxt.timeCost[TIME_COST_STEP_1]); } while (record != NULL); // end of main redo apply loop - SendRecoveryEndMarkToWorkersAndWaitForFinish(0); + + if (SS_IN_ONDEMAND_RECOVERY) { + OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(0); + } else { + SendRecoveryEndMarkToWorkersAndWaitForFinish(0); + } RecoveryXlogReader(oldXlogReader, xlogreader); if (!(IS_OBS_DISASTER_RECOVER_MODE || IS_DISASTER_RECOVER_MODE)) { @@ -10489,6 +10041,11 @@ void StartupXLOG(void) } else { /* there are no WAL records following the checkpoint */ ereport(LOG, (errmsg("redo is not required"))); + if (SS_IN_ONDEMAND_RECOVERY) { + g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false; + g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL; + SSSaveReformerCtrl(); + } } } /* Set undoCountThreshold as a proper value after finish recovery. 
 */
@@ -10768,7 +10325,7 @@ void StartupXLOG(void)
     t_thrd.xlog_cxt.InRecovery = false;
     g_instance.roach_cxt.isRoachRestore = false;
 
-    if (!SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING) {
+    if (!SS_STANDBY_FAILOVER && !SS_STANDBY_PROMOTING && !SS_IN_ONDEMAND_RECOVERY) {
         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
         t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
         t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
@@ -10885,12 +10442,14 @@ void StartupXLOG(void)
         xlogctl->SharedRecoveryInProgress = false;
         xlogctl->IsRecoveryDone = true;
         SpinLockRelease(&xlogctl->info_lck);
-        NotifyGscRecoveryFinished();
-        if (ENABLE_INCRE_CKPT) {
-            RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
-            (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
-            state->start = state->end;
-            (void)LWLockRelease(state->recovery_queue_lock);
+        if (!SS_IN_ONDEMAND_RECOVERY) {
+            NotifyGscRecoveryFinished();
+            if (ENABLE_INCRE_CKPT) {
+                RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
+                (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
+                state->start = state->end;
+                (void)LWLockRelease(state->recovery_queue_lock);
+            }
         }
     }
 
@@ -10899,19 +10458,20 @@ void StartupXLOG(void)
             g_instance.dms_cxt.SSRecoveryInfo.failover_ckpt_status = ALLOW_CKPT;
             pg_memory_barrier();
         }
-        ereport(LOG, (errmodule(MOD_DMS),
-            errmsg("[SS switchover/SS failover] standby promoting: start full checkpoint.")));
-
-        RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
-        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-        t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
-        t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
-        UpdateControlFile();
-        LWLockRelease(ControlFileLock);
-        SSRecheckBufferPool();
-        ereport(LOG, (errmodule(MOD_DMS),
-            errmsg("[SS switchover/SS failover] standby promoting: finished full checkpoint"
-            "and update control file")));
+        if (!SS_IN_ONDEMAND_RECOVERY) {
+            ereport(LOG, (errmodule(MOD_DMS),
+                errmsg("[SS switchover/SS failover] standby promoting: start full checkpoint.")));
+            RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
+            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+            t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
+            t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
+            UpdateControlFile();
+            LWLockRelease(ControlFileLock);
+            SSRecheckBufferPool();
+            ereport(LOG, (errmodule(MOD_DMS),
+                errmsg("[SS switchover/SS failover] standby promoting: finished full checkpoint "
+                "and updated control file")));
+        }
     }
 
     NextXidAfterReovery = t_thrd.xact_cxt.ShmemVariableCache->nextXid;
@@ -10946,6 +10506,44 @@ void StartupXLOG(void)
         g_instance.dms_cxt.SSClusterState = NODESTATE_STANDBY_PROMOTED;
     }
 
+    if (SS_IN_ONDEMAND_RECOVERY) {
+        /* we block here until on-demand redo has finished */
+        ereport(LOG, (errmsg("[SS] On-demand redo, nextXid: " XID_FMT ", startupMaxXid: " XID_FMT
+                             ", recentLocalXmin: " XID_FMT ", recentGlobalXmin: %lu, PendingPreparedXacts: %d"
+                             ", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.",
+                             NextXidAfterReovery, t_thrd.xact_cxt.ShmemVariableCache->startupMaxXid,
+                             t_thrd.xact_cxt.ShmemVariableCache->recentLocalXmin,
+                             t_thrd.xact_cxt.ShmemVariableCache->recentGlobalXmin, PendingPreparedXactsCount,
+                             t_thrd.xact_cxt.ShmemVariableCache->nextCommitSeqNo,
+                             t_thrd.xact_cxt.ShmemVariableCache->cutoff_csn_min)));
+        OnDemandWaitRedoFinish();
+        /* do the work we skipped earlier */
+        XLogCheckInvalidPages();
+        XLogCheckRemainSegs();
+
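+        /*
+         * On-demand replay is complete: take the full checkpoint that was
+         * skipped in the promoting path above, then publish CLUSTER_NORMAL
+         * through the reformer control page so the other nodes can leave
+         * on-demand recovery.
+         */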
+        RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
+        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+        t_thrd.shemem_ptr_cxt.ControlFile->state = DB_IN_PRODUCTION;
+        t_thrd.shemem_ptr_cxt.ControlFile->time = (pg_time_t)time(NULL);
+        UpdateControlFile();
+        LWLockRelease(ControlFileLock);
+        SSRecheckBufferPool();
+        ereport(LOG, (errmodule(MOD_DMS),
+            errmsg("[SS][on-demand recovery] finished full checkpoint and updated control file")));
+
+        NotifyGscRecoveryFinished();
+        if (ENABLE_INCRE_CKPT) {
+            RecoveryQueueState *state = &g_instance.ckpt_cxt_ctl->ckpt_redo_state;
+            (void)LWLockAcquire(state->recovery_queue_lock, LW_EXCLUSIVE);
+            state->start = state->end;
+            (void)LWLockRelease(state->recovery_queue_lock);
+        }
+        /* for other nodes in cluster */
+        g_instance.dms_cxt.SSReformerControl.clusterStatus = CLUSTER_NORMAL;
+        SSSaveReformerCtrl();
+        g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery = false;
+    }
+
     ereport(LOG, (errmsg("redo done, nextXid: " XID_FMT
                          ", startupMaxXid: " XID_FMT ", recentLocalXmin: " XID_FMT
                          ", recentGlobalXmin: %lu, PendingPreparedXacts: %d"
                          ", NextCommitSeqNo: %lu, cutoff_csn_min: %lu.",
@@ -12628,7 +12226,7 @@ static void RecoveryRestartPoint(const CheckPoint checkPoint, XLogRecPtr recordR
     }
 
     if (IsExtremeRedo()) {
-        XLogRecPtr safeCheckPoint = extreme_rto::GetSafeMinCheckPoint();
+        XLogRecPtr safeCheckPoint = ExtremeGetSafeMinCheckPoint();
         if (XLByteEQ(safeCheckPoint, MAX_XLOG_REC_PTR) || XLByteLT(safeCheckPoint, recordReadRecPtr)) {
             ereport(WARNING, (errmsg("RecoveryRestartPoint is false at %X/%X,last safe point is %X/%X",
                 (uint32)(recordReadRecPtr >> 32), (uint32)(recordReadRecPtr),
@@ -16767,490 +16365,6 @@ bool NewDataIsInBuf(XLogRecPtr expectedRecPtr)
 
     return havedata;
 }
 
-void SwitchToReadXlogFromFile(XLogRecPtr pageptr)
-{
-    pg_atomic_write_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.readSource, XLOG_FROM_PG_XLOG);
-    pg_atomic_write_u64(&extreme_rto::g_dispatcher->rtoXlogBufState.expectLsn, InvalidXLogRecPtr);
-    pg_atomic_write_u32(&(extreme_rto::g_recordbuffer->readWorkerState), extreme_rto::WORKER_STATE_STOPPING);
-    uint32 workerState = pg_atomic_read_u32(&(extreme_rto::g_recordbuffer->readWorkerState));
-    while (workerState != extreme_rto::WORKER_STATE_EXIT && workerState != extreme_rto::WORKER_STATE_STOP) {
-        RedoInterruptCallBack();
-        workerState = pg_atomic_read_u32(&(extreme_rto::g_recordbuffer->readWorkerState));
-    }
-}
-
-static inline XLogRecPtr CalcExpectLsn(XLogRecPtr recPtr)
-{
-    XLogRecPtr expectedRecPtr = recPtr;
-    if (recPtr % XLogSegSize == 0) {
-        XLByteAdvance(expectedRecPtr, SizeOfXLogLongPHD);
-    } else if (recPtr % XLOG_BLCKSZ == 0) {
-        XLByteAdvance(expectedRecPtr, SizeOfXLogShortPHD);
-    }
-    return expectedRecPtr;
-}
-
-bool HasReceivedTrigger()
-{
-    uint32 trigger = pg_atomic_read_u32(&extreme_rto::g_readManagerTriggerFlag);
-    if (trigger > 0) {
-        pg_atomic_write_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.readSource, XLOG_FROM_PG_XLOG);
-        pg_atomic_write_u32(&(extreme_rto::g_recordbuffer->readWorkerState), extreme_rto::WORKER_STATE_STOPPING);
-        return true;
-    }
-    return false;
-}
-
-// receivedUpto indicate received new datas, but can not read,we should check
-bool IsReceivingStatusOk()
-{
-    WalRcvCtlBlock *walrcb = getCurrentWalRcvCtlBlock();
-    uint32 startreadworker = pg_atomic_read_u32(&(extreme_rto::g_recordbuffer->readWorkerState));
-    if (startreadworker == extreme_rto::WORKER_STATE_STOP && walrcb == NULL) {
-        return false;
-    }
-    return true;
-}
-
-static bool DoEarlyExit()
-{
-    if (extreme_rto::g_dispatcher == NULL) {
-        return 
false; - } - return extreme_rto::g_dispatcher->recoveryStop; -} - -int ParallelXLogReadWorkBufRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, - XLogRecPtr targetRecPtr, TimeLineID *readTLI) -{ - XLogRecPtr RecPtr = targetPagePtr; - uint32 targetPageOff = targetPagePtr % XLogSegSize; - - XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo); - XLByteAdvance(RecPtr, reqLen); - - XLogRecPtr expectedRecPtr = CalcExpectLsn(RecPtr); - uint64 waitXLogCount = 0; - const uint64 pushLsnCount = 2; - - pg_atomic_write_u64(&extreme_rto::g_dispatcher->rtoXlogBufState.expectLsn, expectedRecPtr); - for (;;) { - // Check to see if the trigger file exists. If so, update the gaussdb state file. - if (CheckForStandbyTrigger() -#ifndef ENABLE_MULTIPLE_NODES - && IsDCFReadyOrDisabled() -#endif - ) { - SendPostmasterSignal(PMSIGNAL_UPDATE_NORMAL); - } - - /* - * If we find an invalid record in the WAL streamed from - * master, something is seriously wrong. There's little - * chance that the problem will just go away, but PANIC is - * not good for availability either, especially in hot - * standby mode. Disconnect, and retry from - * archive/pg_xlog again. The WAL in the archive should be - * identical to what was streamed, so it's unlikely that - * it helps, but one can hope... - */ - if (t_thrd.xlog_cxt.failedSources & XLOG_FROM_STREAM) { - pg_atomic_write_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.failSource, XLOG_FROM_STREAM); - SwitchToReadXlogFromFile(targetPagePtr); - return -1; - } - - extreme_rto::ResetRtoXlogReadBuf(targetPagePtr); - /* - * Walreceiver is active, so see if new data has arrived. - * - * We only advance XLogReceiptTime when we obtain fresh - * WAL from walreceiver and observe that we had already - * processed everything before the most recent "chunk" - * that it flushed to disk. In steady state where we are - * keeping up with the incoming data, XLogReceiptTime will - * be updated on each cycle. When we are behind, - * XLogReceiptTime will not advance, so the grace time - * alloted to conflicting queries will decrease. - */ - bool havedata = NewDataIsInBuf(expectedRecPtr); - if (havedata) { - /* just make sure source info is correct... 
*/ - t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; - t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; - waitXLogCount = 0; - if ((targetPagePtr / XLOG_BLCKSZ) != (t_thrd.xlog_cxt.receivedUpto / XLOG_BLCKSZ)) { - t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ; - } else { - t_thrd.xlog_cxt.readLen = t_thrd.xlog_cxt.receivedUpto % XLogSegSize - targetPageOff; - } - - /* read from wal writer buffer */ - bool readflag = extreme_rto::XLogPageReadForExtRto(xlogreader, targetPagePtr, t_thrd.xlog_cxt.readLen); - if (readflag) { - *readTLI = t_thrd.xlog_cxt.curFileTLI; - return t_thrd.xlog_cxt.readLen; - } else { - if (!IsReceivingStatusOk()) { - SwitchToReadXlogFromFile(targetPagePtr); - return -1; - } - } - } else { - if (HasReceivedTrigger()) { - return -1; - } - - uint32 waitRedoDone = pg_atomic_read_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.waitRedoDone); - if (waitRedoDone == 1 || DoEarlyExit()) { - SwitchToReadXlogFromFile(targetPagePtr); - return -1; - } - /* - * Wait for more WAL to arrive, or timeout to be reached - */ - WaitLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch, WL_LATCH_SET | WL_TIMEOUT, 1000L); - ResetLatch(&t_thrd.shemem_ptr_cxt.XLogCtl->recoveryWakeupLatch); - extreme_rto::PushToWorkerLsn(waitXLogCount == pushLsnCount); - ++waitXLogCount; - } - - RedoInterruptCallBack(); - } - - return -1; -} - -void WaitReplayFinishAfterReadXlogFileComplete(XLogRecPtr lastValidRecordLsn) -{ - Assert(t_thrd.xlog_cxt.EndRecPtr == lastValidRecordLsn); - XLogRecPtr lastReplayedLsn = GetXLogReplayRecPtr(NULL); - - while (XLByteLT(lastReplayedLsn, lastValidRecordLsn) && !DoEarlyExit()) { - RedoInterruptCallBack(); - const long sleepTime = 100; - pg_usleep(sleepTime); - lastReplayedLsn = GetXLogReplayRecPtr(NULL); - } -} - -int ParallelXLogPageReadFile(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, - TimeLineID *readTLI) -{ - bool randAccess = false; - uint32 targetPageOff; - volatile XLogCtlData *xlogctl = t_thrd.shemem_ptr_cxt.XLogCtl; - XLogRecPtr RecPtr = targetPagePtr; - uint32 ret; -#ifdef USE_ASSERT_CHECKING - XLogSegNo targetSegNo; - - XLByteToSeg(targetPagePtr, targetSegNo); -#endif - targetPageOff = targetPagePtr % XLogSegSize; - - /* - * See if we need to switch to a new segment because the requested record - * is not in the currently open one. - */ - if (t_thrd.xlog_cxt.readFile >= 0 && !XLByteInSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo)) { - close(t_thrd.xlog_cxt.readFile); - t_thrd.xlog_cxt.readFile = -1; - t_thrd.xlog_cxt.readSource = 0; - } - - XLByteToSeg(targetPagePtr, t_thrd.xlog_cxt.readSegNo); - XLByteAdvance(RecPtr, reqLen); - -retry: - /* See if we need to retrieve more data */ - if (t_thrd.xlog_cxt.readFile < 0) { - if (t_thrd.xlog_cxt.StandbyMode) { - /* - * In standby mode, wait for the requested record to become - * available, either via restore_command succeeding to restore the - * segment, or via walreceiver having streamed the record. - */ - for (;;) { - RedoInterruptCallBack(); - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - t_thrd.xlog_cxt.readFile = -1; - } - /* Reset curFileTLI if random fetch. */ - if (randAccess) { - t_thrd.xlog_cxt.curFileTLI = 0; - } - - /* - * Try to restore the file from archive, or read an - * existing file from pg_xlog. - */ - uint32 sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG; - if (!(sources & ~t_thrd.xlog_cxt.failedSources)) { - /* - * We've exhausted all options for retrieving the - * file. Retry. 
- */ - t_thrd.xlog_cxt.failedSources = 0; - - /* - * Before we sleep, re-scan for possible new timelines - * if we were requested to recover to the latest - * timeline. - */ - if (t_thrd.xlog_cxt.recoveryTargetIsLatest) { - if (rescanLatestTimeLine()) { - continue; - } - } - - extreme_rto::PushToWorkerLsn(true); - WaitReplayFinishAfterReadXlogFileComplete(t_thrd.xlog_cxt.EndRecPtr); - - if (!xlogctl->IsRecoveryDone) { - g_instance.comm_cxt.predo_cxt.redoPf.redo_done_time = GetCurrentTimestamp(); - g_instance.comm_cxt.predo_cxt.redoPf.recovery_done_ptr = t_thrd.xlog_cxt.ReadRecPtr; - } - - XLogRecPtr lastReplayedLsn = GetXLogReplayRecPtr(NULL); - ereport(LOG, - (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("ParallelXLogPageReadFile IsRecoveryDone is %s set true," - "ReadRecPtr:%X/%X, EndRecPtr:%X/%X, lastreplayed:%X/%X", - xlogctl->IsRecoveryDone ? "next" : "first", - (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), (uint32)(t_thrd.xlog_cxt.ReadRecPtr), - (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), (uint32)(t_thrd.xlog_cxt.EndRecPtr), - (uint32)(lastReplayedLsn >> 32), (uint32)(lastReplayedLsn)))); - - /* - * signal postmaster to update local redo end - * point to gaussdb state file. - */ - if (!xlogctl->IsRecoveryDone) { - SendPostmasterSignal(PMSIGNAL_LOCAL_RECOVERY_DONE); - } - - SpinLockAcquire(&xlogctl->info_lck); - xlogctl->IsRecoveryDone = true; - SpinLockRelease(&xlogctl->info_lck); - if (!(IS_SHARED_STORAGE_MODE) || - pg_atomic_read_u32(&t_thrd.walreceiverfuncs_cxt.WalRcv->rcvDoneFromShareStorage)) { - knl_g_set_redo_finish_status(REDO_FINISH_STATUS_LOCAL | REDO_FINISH_STATUS_CM); - ereport(LOG, - (errmodule(MOD_REDO), errcode(ERRCODE_LOG), - errmsg("ParallelXLogPageReadFile set redo finish status," - "ReadRecPtr:%X/%X, EndRecPtr:%X/%X", - (uint32)(t_thrd.xlog_cxt.ReadRecPtr >> 32), - (uint32)(t_thrd.xlog_cxt.ReadRecPtr), (uint32)(t_thrd.xlog_cxt.EndRecPtr >> 32), - (uint32)(t_thrd.xlog_cxt.EndRecPtr)))); - - /* - * If it hasn't been long since last attempt, sleep 1s to - * avoid busy-waiting. - */ - pg_usleep(150000L); - } - /* - * If primary_conninfo is set, launch walreceiver to - * try to stream the missing WAL, before retrying to - * restore from archive/pg_xlog. - * - * If fetching_ckpt is TRUE, RecPtr points to the - * initial checkpoint location. In that case, we use - * RedoStartLSN as the streaming start position - * instead of RecPtr, so that when we later jump - * backwards to start redo at RedoStartLSN, we will - * have the logs streamed already. 
- */ - - uint32 trigger = pg_atomic_read_u32(&extreme_rto::g_readManagerTriggerFlag); - if (trigger > 0) { - pg_atomic_write_u32(&extreme_rto::g_readManagerTriggerFlag, extreme_rto::TRIGGER_NORMAL); - goto triggered; - } - - load_server_mode(); - if (t_thrd.xlog_cxt.PrimaryConnInfo || t_thrd.xlog_cxt.server_mode == STANDBY_MODE) { - t_thrd.xlog_cxt.receivedUpto = 0; - uint32 failSouce = pg_atomic_read_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.failSource); - - if (!(failSouce & XLOG_FROM_STREAM)) { - volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv; - SpinLockAcquire(&walrcv->mutex); - walrcv->receivedUpto = 0; - SpinLockRelease(&walrcv->mutex); - t_thrd.xlog_cxt.readSource = XLOG_FROM_STREAM; - t_thrd.xlog_cxt.XLogReceiptSource = XLOG_FROM_STREAM; - pg_atomic_write_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.readSource, - XLOG_FROM_STREAM); - pg_atomic_write_u32(&extreme_rto::g_dispatcher->rtoXlogBufState.waitRedoDone, 0); - return -1; - } - } - } - /* Don't try to read from a source that just failed */ - sources &= ~t_thrd.xlog_cxt.failedSources; - t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, DEBUG2, sources); - if (t_thrd.xlog_cxt.readFile >= 0) { - break; - } - /* - * Nope, not found in archive and/or pg_xlog.: - */ - t_thrd.xlog_cxt.failedSources |= sources; - - /* - * Check to see if the trigger file exists. Note that we - * do this only after failure, so when you create the - * trigger file, we still finish replaying as much as we - * can from archive and pg_xlog before failover. - */ - uint32 trigger = pg_atomic_read_u32(&extreme_rto::g_readManagerTriggerFlag); - if (trigger > 0) { - pg_atomic_write_u32(&extreme_rto::g_readManagerTriggerFlag, extreme_rto::TRIGGER_NORMAL); - goto triggered; - } - } - } else { - /* In archive or crash recovery. */ - if (t_thrd.xlog_cxt.readFile < 0) { - uint32 sources; - - /* Reset curFileTLI if random fetch. */ - if (randAccess) { - t_thrd.xlog_cxt.curFileTLI = 0; - } - - sources = XLOG_FROM_PG_XLOG; - if (t_thrd.xlog_cxt.InArchiveRecovery) { - sources |= XLOG_FROM_ARCHIVE; - } - - t_thrd.xlog_cxt.readFile = XLogFileReadAnyTLI(t_thrd.xlog_cxt.readSegNo, LOG, sources); - - if (t_thrd.xlog_cxt.readFile < 0) { - return -1; - } - } - } - } - - /* - * At this point, we have the right segment open and if we're streaming we - * know the requested record is in it. - */ - Assert(t_thrd.xlog_cxt.readFile != -1); - - /* - * If the current segment is being streamed from master, calculate how - * much of the current page we have received already. We know the - * requested record has been received, but this is for the benefit of - * future calls, to allow quick exit at the top of this function. 
- */ - t_thrd.xlog_cxt.readLen = XLOG_BLCKSZ; - - /* Read the requested page */ - t_thrd.xlog_cxt.readOff = targetPageOff; - -try_again: - if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { - ereport(emode_for_corrupt_record(LOG, RecPtr), - (errcode_for_file_access(), - errmsg("could not seek in log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; - } - pgstat_report_waitevent(WAIT_EVENT_WAL_READ); - ret = read(t_thrd.xlog_cxt.readFile, xlogreader->readBuf, XLOG_BLCKSZ); - pgstat_report_waitevent(WAIT_EVENT_END); - if (ret != XLOG_BLCKSZ) { - ereport(emode_for_corrupt_record(LOG, RecPtr), - (errcode_for_file_access(), - errmsg("could not read from log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; - } - Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo); - Assert(targetPageOff == t_thrd.xlog_cxt.readOff); - Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen); - - *readTLI = t_thrd.xlog_cxt.curFileTLI; - - return t_thrd.xlog_cxt.readLen; - -next_record_is_invalid: - t_thrd.xlog_cxt.failedSources |= t_thrd.xlog_cxt.readSource; - - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - } - t_thrd.xlog_cxt.readFile = -1; - t_thrd.xlog_cxt.readLen = 0; - t_thrd.xlog_cxt.readSource = 0; - - /* In standby-mode, keep trying */ - if (t_thrd.xlog_cxt.StandbyMode) { - goto retry; - } else { - return -1; - } - -triggered: - if (t_thrd.xlog_cxt.readFile >= 0) { - close(t_thrd.xlog_cxt.readFile); - } - t_thrd.xlog_cxt.readFile = -1; - t_thrd.xlog_cxt.readLen = 0; - t_thrd.xlog_cxt.readSource = 0; - t_thrd.xlog_cxt.recoveryTriggered = true; - - return -1; -} - -int ParallelXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, XLogRecPtr targetRecPtr, - TimeLineID *readTLI) -{ - int readLen = -1; - pg_atomic_write_u64(&extreme_rto::g_dispatcher->rtoXlogBufState.targetRecPtr, targetRecPtr); - xlogreader->readBuf = extreme_rto::g_dispatcher->rtoXlogBufState.readBuf; - - for (;;) { - uint32 readSource = pg_atomic_read_u32(&(extreme_rto::g_recordbuffer->readSource)); - if (readSource & XLOG_FROM_STREAM) { - readLen = ParallelXLogReadWorkBufRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); - } else { - if (SS_STANDBY_FAILOVER || SS_STANDBY_PROMOTING) { - readLen = SSXLogPageRead(xlogreader, targetPagePtr, reqLen, targetRecPtr, - xlogreader->readBuf, readTLI, NULL); - } else { - readLen = ParallelXLogPageReadFile(xlogreader, targetPagePtr, reqLen, targetRecPtr, readTLI); - } - } - - if (readLen > 0 || t_thrd.xlog_cxt.recoveryTriggered || !t_thrd.xlog_cxt.StandbyMode || DoEarlyExit()) { - return readLen; - } - - RedoInterruptCallBack(); - ADD_ABNORMAL_POSITION(10); - } - - return readLen; -} - static ReplConnTarget GetRepConntarget(void) { if (t_thrd.xlog_cxt.is_cascade_standby) { @@ -17959,49 +17073,36 @@ retry: /* Read the requested page */ t_thrd.xlog_cxt.readOff = targetPageOff; - if (ENABLE_DSS && ENABLE_DMS) { - bool ss_ret = SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, readBuf); - if (!ss_ret) { - ereport(emode_for_corrupt_record(emode, RecPtr), +try_again: + if (lseek(t_thrd.xlog_cxt.readFile, 
(off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { + ereport(emode_for_corrupt_record(emode, RecPtr), (errcode_for_file_access(), - errmsg("[SS] could not read from log file %s to offset %u: %m", + errmsg("could not seek in log file %s to offset %u: %m", XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), t_thrd.xlog_cxt.readOff))); - goto next_record_is_invalid; - } - } else { -try_again: - if (lseek(t_thrd.xlog_cxt.readFile, (off_t)t_thrd.xlog_cxt.readOff, SEEK_SET) < 0) { - ereport(emode_for_corrupt_record(emode, RecPtr), - (errcode_for_file_access(), - errmsg("could not seek in log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; } - pgstat_report_waitevent(WAIT_EVENT_WAL_READ); - ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ); - pgstat_report_waitevent(WAIT_EVENT_END); - if (ret != XLOG_BLCKSZ) { - ereport(emode_for_corrupt_record(emode, RecPtr), - (errcode_for_file_access(), - errmsg("could not read from log file %s to offset %u: %m", - XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), - t_thrd.xlog_cxt.readOff))); - if (errno == EINTR) { - errno = 0; - pg_usleep(1000); - goto try_again; - } - goto next_record_is_invalid; + goto next_record_is_invalid; + } + pgstat_report_waitevent(WAIT_EVENT_WAL_READ); + ret = read(t_thrd.xlog_cxt.readFile, readBuf, XLOG_BLCKSZ); + pgstat_report_waitevent(WAIT_EVENT_END); + if (ret != XLOG_BLCKSZ) { + ereport(emode_for_corrupt_record(emode, RecPtr), + (errcode_for_file_access(), + errmsg("could not read from log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.xlog_cxt.readSegNo), + t_thrd.xlog_cxt.readOff))); + if (errno == EINTR) { + errno = 0; + pg_usleep(1000); + goto try_again; } + goto next_record_is_invalid; } - Assert(targetSegNo == t_thrd.xlog_cxt.readSegNo); Assert(targetPageOff == t_thrd.xlog_cxt.readOff); Assert((uint32)reqLen <= t_thrd.xlog_cxt.readLen); @@ -18057,7 +17158,7 @@ triggered: * you are about to ereport(), or you might cause a later message to be * erroneously suppressed. 
 */
-static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
+int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
 {
     if (t_thrd.xlog_cxt.readSource == XLOG_FROM_PG_XLOG && emode == LOG) {
         if (XLByteEQ(RecPtr, t_thrd.xlog_cxt.lastComplaint)) {
@@ -18212,10 +17313,9 @@ static bool CheckForPrimaryTrigger(void)
         return false;
     } else {
         /* check for primary */
-        uint32 tgigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState));
-        if (tgigger == extreme_rto::TRIGGER_PRIMARY) {
-            (void)pg_atomic_compare_exchange_u32(&(extreme_rto::g_startupTriggerState), &tgigger,
-                extreme_rto::TRIGGER_NORMAL);
+        uint32 trigger = pg_atomic_read_u32(&g_startupTriggerState);
+        if (trigger == TRIGGER_PRIMARY) {
+            (void)pg_atomic_compare_exchange_u32(&g_startupTriggerState, &trigger, TRIGGER_NORMAL);
             return true;
         }
     }
@@ -18223,7 +17323,7 @@
     return false;
 }
 
-static bool CheckForStandbyTrigger(void)
+bool CheckForStandbyTrigger(void)
 {
     if (AmStartupProcess()) {
         if (IsStandbyTriggered()) {
@@ -18235,10 +17335,9 @@ static bool CheckForStandbyTrigger(void)
         return false;
     } else {
         /* check for primary */
-        uint32 tgigger = pg_atomic_read_u32(&(extreme_rto::g_startupTriggerState));
-        if (tgigger == extreme_rto::TRIGGER_STADNBY) {
-            (void)pg_atomic_compare_exchange_u32(&(extreme_rto::g_startupTriggerState), &tgigger,
-                extreme_rto::TRIGGER_NORMAL);
+        uint32 trigger = pg_atomic_read_u32(&g_startupTriggerState);
+        if (trigger == TRIGGER_STADNBY) {
+            (void)pg_atomic_compare_exchange_u32(&g_startupTriggerState, &trigger, TRIGGER_NORMAL);
             return true;
         }
     }
@@ -18270,29 +17369,29 @@ bool CheckFinishRedoSignal(void)
     return is_finish_redo;
 }
 
-extreme_rto::Enum_TriggeredState CheckForSatartupStatus(void)
+Enum_TriggeredState CheckForSatartupStatus(void)
 {
     if (t_thrd.startup_cxt.primary_triggered) {
         ereport(LOG, (errmsg("received primary request")));
         ResetPrimaryTriggered();
-        return extreme_rto::TRIGGER_PRIMARY;
+        return TRIGGER_PRIMARY;
     }
     if (t_thrd.startup_cxt.standby_triggered) {
         ereport(LOG, (errmsg("received standby request")));
         ResetStandbyTriggered();
-        return extreme_rto::TRIGGER_STADNBY;
+        return TRIGGER_STADNBY;
     }
     if (t_thrd.startup_cxt.failover_triggered) {
         ereport(LOG, (errmsg("received failover request")));
         ResetFailoverTriggered();
-        return extreme_rto::TRIGGER_FAILOVER;
+        return TRIGGER_FAILOVER;
     }
     if (t_thrd.startup_cxt.switchover_triggered) {
         ereport(LOG, (errmsg("received switchover request")));
         ResetSwitchoverTriggered();
-        return extreme_rto::TRIGGER_FAILOVER;
+        return TRIGGER_FAILOVER;
     }
-    return extreme_rto::TRIGGER_NORMAL;
+    return TRIGGER_NORMAL;
 }
 
 /*
@@ -18808,7 +17907,7 @@ void heap_xlog_logical_new_page(XLogReaderState *record)
 /*
  * Returns true if 'expectedTLEs' contains a timeline with id 'tli'
 */
-static bool timeLineInHistory(TimeLineID tli, List *expectedTLEs)
+bool timeLineInHistory(TimeLineID tli, List *expectedTLEs)
 {
     ListCell *cell = NULL;
 
@@ -19830,6 +18929,78 @@ bool SSModifySharedLunAllowed()
     return false;
 }
 
+static int SSOndemandCopyXLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
+{
+    return XLogFileInitInternal(logsegno, use_existent, use_lock, SS_XLOGRECOVERYDIR);
+}
+
+static int SSOndemandCopyXlogFileOpen(XLogSegNo segno)
+{
+    return XLogFileOpenInternal(segno, SS_XLOGRECOVERYDIR);
+}
+
+static void SSOndemandCopyXlogFileClose(void)
+{
+    Assert(t_thrd.ondemand_xlog_copy_cxt.openLogFile >= 0);
+
+    if (close(t_thrd.ondemand_xlog_copy_cxt.openLogFile)) {
+        ereport(PANIC, (errcode_for_file_access(),
+            errmsg("could not close copy log file %s: %m",
+                XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo))));
+    }
+
+    t_thrd.ondemand_xlog_copy_cxt.openLogFile = -1;
+}
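+
+/*
+ * SSOndemandXlogCopy below mirrors replayed xlog into SS_XLOGRECOVERYDIR,
+ * one segment file at a time (openLogSegNo/openLogOff track the current
+ * copy position). It is a no-op unless this node is in on-demand recovery
+ * and is not the official recovery node.
+ */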
errmsg("could not close copy log file %s: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo)))); + } + + t_thrd.ondemand_xlog_copy_cxt.openLogFile = -1; +} + +static void SSOndemandXlogCopy(XLogSegNo copySegNo, uint32 startOffset, char *copyBuffer, Size copyBytes) +{ + // only copy when recovery node and reformer node is not same + if (!SS_IN_ONDEMAND_RECOVERY || SS_OFFICIAL_RECOVERY_NODE) { + return; + } + + if (t_thrd.ondemand_xlog_copy_cxt.openLogSegNo != copySegNo) { + if (t_thrd.ondemand_xlog_copy_cxt.openLogFile >= 0) { + SSOndemandCopyXlogFileClose(); + } + t_thrd.ondemand_xlog_copy_cxt.openLogSegNo = copySegNo; + + bool use_existent = true; + t_thrd.ondemand_xlog_copy_cxt.openLogFile = + SSOndemandCopyXLogFileInit(t_thrd.ondemand_xlog_copy_cxt.openLogSegNo, &use_existent, true); + t_thrd.ondemand_xlog_copy_cxt.openLogOff = 0; + } + + if (t_thrd.ondemand_xlog_copy_cxt.openLogFile <= 0) { + t_thrd.ondemand_xlog_copy_cxt.openLogFile = + SSOndemandCopyXlogFileOpen(t_thrd.ondemand_xlog_copy_cxt.openLogSegNo); + t_thrd.ondemand_xlog_copy_cxt.openLogOff = 0; + } + + if (t_thrd.ondemand_xlog_copy_cxt.openLogOff != startOffset) { + if (lseek(t_thrd.ondemand_xlog_copy_cxt.openLogFile, (off_t)startOffset, SEEK_SET) < 0) { + ereport(PANIC, (errcode_for_file_access(), + errmsg("could not seek in log file %s to offset %u: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo), + startOffset))); + } + t_thrd.ondemand_xlog_copy_cxt.openLogOff = startOffset; + } + + Size actualBytes = write(t_thrd.ondemand_xlog_copy_cxt.openLogFile, copyBuffer, copyBytes); + if (actualBytes != copyBytes) { + /* if write didn't set errno, assume no disk space */ + if (errno == 0) { + errno = ENOSPC; + } + ereport(PANIC, (errcode_for_file_access(), + errmsg("could not write to log file %s at offset %u, length %lu: %m", + XLogFileNameP(t_thrd.xlog_cxt.ThisTimeLineID, t_thrd.ondemand_xlog_copy_cxt.openLogSegNo), + t_thrd.ondemand_xlog_copy_cxt.openLogOff, (unsigned long)copyBytes))); + } + t_thrd.ondemand_xlog_copy_cxt.openLogOff += copyBytes; +} + static int SSReadXLog(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int expectReadLen, XLogRecPtr targetRecPtr, char *buf, TimeLineID *readTLI, char* xlog_path) { @@ -20218,8 +19389,14 @@ retry: /* Read the requested page */ t_thrd.xlog_cxt.readOff = targetPageOff; - bool ret = SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, buf); - if (!ret) { + int actualBytes; + if (xlogreader->preReadBuf != NULL) { + actualBytes = SSReadXlogInternal(xlogreader, targetPagePtr, targetRecPtr, buf, XLOG_BLCKSZ); + } else { + actualBytes = (int)pread(t_thrd.xlog_cxt.readFile, buf, XLOG_BLCKSZ, t_thrd.xlog_cxt.readOff); + } + + if (actualBytes != XLOG_BLCKSZ) { ereport(LOG, (errcode_for_file_access(), errmsg("read xlog(start:%X/%X, pos:%u len:%d) failed : %m", static_cast(targetPagePtr >> BIT_NUM_INT32), static_cast(targetPagePtr), targetPageOff, diff --git a/src/gausskernel/storage/access/transam/xlogreader.cpp b/src/gausskernel/storage/access/transam/xlogreader.cpp index b79ba4b8c55c1612dbea70f21a1fa96bc707eb02..73729d16e6efef476659d75a2f58ccfcde726166 100644 --- a/src/gausskernel/storage/access/transam/xlogreader.cpp +++ b/src/gausskernel/storage/access/transam/xlogreader.cpp @@ -105,6 +105,7 @@ XLogReaderState *XLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_d state->max_block_id = -1; state->isPRProcess = false; + state->preReadBuf = NULL; /* * Permanently allocate 
readBuf. We do it this way, rather than just diff --git a/src/gausskernel/storage/access/transam/xlogutils.cpp b/src/gausskernel/storage/access/transam/xlogutils.cpp index 6692dca39d322a835b02a2668e11d2e14dbeea59..211d34d504b9add939b7f3f7213ce1cd7d6da89a 100644 --- a/src/gausskernel/storage/access/transam/xlogutils.cpp +++ b/src/gausskernel/storage/access/transam/xlogutils.cpp @@ -30,7 +30,6 @@ #include "access/xlogproc.h" #include "access/multi_redo_api.h" #include "access/parallel_recovery/dispatcher.h" -#include "access/extreme_rto/page_redo.h" #include "catalog/catalog.h" #include "catalog/storage_xlog.h" #include "miscadmin.h" @@ -514,6 +513,10 @@ static void CollectInvalidPagesStates(uint32 *nstates_ptr, InvalidPagesState *** /* Complain about any remaining invalid-page entries */ void XLogCheckInvalidPages(void) { + if (SS_ONDEMAND_BUILD_DONE && !SS_ONDEMAND_RECOVERY_DONE) { + return; + } + bool foundone = false; if (t_thrd.xlog_cxt.forceFinishHappened) { ereport(WARNING, @@ -672,7 +675,7 @@ XLogRedoAction XLogReadBufferForRedoBlockExtend(RedoBufferTag *redoblock, ReadBu if (pageisvalid) { if (readmethod != WITH_LOCAL_CACHE) { if (mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK) { - if (ENABLE_DMS) + if (ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY) LockBuffer(buf, BUFFER_LOCK_SHARE); else if (get_cleanup_lock) LockBufferForCleanup(buf); @@ -699,7 +702,7 @@ XLogRedoAction XLogReadBufferForRedoBlockExtend(RedoBufferTag *redoblock, ReadBu return BLK_DONE; } else { if (readmethod != WITH_LOCAL_CACHE && mode != RBM_ZERO_AND_LOCK && mode != RBM_ZERO_AND_CLEANUP_LOCK && - ENABLE_DMS) { + ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY) { Assert(!CheckPageNeedSkipInRecovery(buf)); LockBuffer(buf, BUFFER_LOCK_UNLOCK); if (get_cleanup_lock) { @@ -1438,7 +1441,7 @@ void XLogDropRelation(const RelFileNode &rnode, ForkNumber forknum) /* clear relfilenode match entry of recovery thread hashtbl */ if (IsExtremeRedo()) { - extreme_rto::ClearRecoveryThreadHashTbl(rnode, forknum, 0, false); + ExtremeClearRecoveryThreadHashTbl(rnode, forknum, 0, false); } else { parallel_recovery::ClearRecoveryThreadHashTbl(rnode, forknum, 0, false); } @@ -1513,7 +1516,7 @@ void XLogDropDatabase(Oid dbid) /* clear dbNode match entry of recovery thread hashtbl */ if (IsExtremeRedo()) { - extreme_rto::BatchClearRecoveryThreadHashTbl(InvalidOid, dbid); + ExtremeBatchClearRecoveryThreadHashTbl(InvalidOid, dbid); } else { parallel_recovery::BatchClearRecoveryThreadHashTbl(InvalidOid, dbid); } @@ -1532,7 +1535,7 @@ void XLogDropSegmentSpace(Oid spcNode, Oid dbNode) /* clear spcNode and dbNode match entry of recovery thread hashtbl */ if (IsExtremeRedo()) { - extreme_rto::BatchClearRecoveryThreadHashTbl(spcNode, dbNode); + ExtremeBatchClearRecoveryThreadHashTbl(spcNode, dbNode); } else { parallel_recovery::BatchClearRecoveryThreadHashTbl(spcNode, dbNode); } @@ -1554,7 +1557,7 @@ void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, BlockNumber nbl /* clear relfilenode match entry of recovery thread hashtbl */ if (g_instance.pid_cxt.PageRepairPID != 0) { if (IsExtremeRedo()) { - extreme_rto::ClearRecoveryThreadHashTbl(rnode, forkNum, nblocks, false); + ExtremeClearRecoveryThreadHashTbl(rnode, forkNum, nblocks, false); } else { parallel_recovery::ClearRecoveryThreadHashTbl(rnode, forkNum, nblocks, false); } @@ -1570,7 +1573,7 @@ void XLogTruncateSegmentSpace(RelFileNode rnode, ForkNumber forkNum, BlockNumber /* clear relfilenode match entry of recovery thread hashtbl */ if (g_instance.pid_cxt.PageRepairPID != 0) { 
if (IsExtremeRedo()) { - extreme_rto::ClearRecoveryThreadHashTbl(rnode, forkNum, nblocks, true); + ExtremeClearRecoveryThreadHashTbl(rnode, forkNum, nblocks, true); } else { parallel_recovery::ClearRecoveryThreadHashTbl(rnode, forkNum, nblocks, true); } diff --git a/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp b/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp index 3a633426804f887808f537f874505c1b9cb01c6d..6e70f321f1a729b17d8bf7fea1dade541ed2e000 100644 --- a/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp +++ b/src/gausskernel/storage/access/ustore/knl_uextremeredo.cpp @@ -31,7 +31,6 @@ #include "access/xlogutils.h" #include "catalog/pg_tablespace.h" #include "storage/standby.h" -#include "access/extreme_rto/page_redo.h" #include "access/ustore/knl_uredo.h" #include "access/ustore/knl_uextremeredo.h" #include "access/ustore/knl_upage.h" @@ -108,7 +107,7 @@ static XLogRecParseState *UHeapXlogInsertParseBlock(XLogReaderState *record, uin return NULL; } - if (!extreme_rto::RedoWorkerIsUndoSpaceWorker()) { + if (!ExtremeRedoWorkerIsUndoSpaceWorker()) { XLogRecSetBlockDataState(record, UHEAP_INSERT_ORIG_BLOCK_NUM, recordstatehead); } else { XLogRecSetUHeapUndoBlockState(record, UHEAP_UNDO_ORIG_BLOCK_NUM, recordstatehead); @@ -127,7 +126,7 @@ static XLogRecParseState *UHeapXlogDeleteParseBlock(XLogReaderState *record, uin return NULL; } - if (!extreme_rto::RedoWorkerIsUndoSpaceWorker()) { + if (!ExtremeRedoWorkerIsUndoSpaceWorker()) { XLogRecSetBlockDataState(record, UHEAP_DELETE_ORIG_BLOCK_NUM, recordstatehead); } else { XLogRecSetUHeapUndoBlockState(record, UHEAP_UNDO_ORIG_BLOCK_NUM, recordstatehead); @@ -154,7 +153,7 @@ static XLogRecParseState *UHeapXlogUpdateParseBlock(XLogReaderState *record, uin return NULL; } - if (!extreme_rto::RedoWorkerIsUndoSpaceWorker()) { + if (!ExtremeRedoWorkerIsUndoSpaceWorker()) { XLogRecSetBlockDataState(record, UHEAP_UPDATE_NEW_BLOCK_NUM, recordstatehead); XLogRecSetAuxiBlkNumState(&recordstatehead->blockparse.extra_rec.blockdatarec, oldblk, InvalidForkNumber); if (oldblk != newblk) { @@ -185,7 +184,7 @@ static XLogRecParseState *UHeapXlogMultiInsertParseBlock(XLogReaderState *record return NULL; } - if (!extreme_rto::RedoWorkerIsUndoSpaceWorker()) { + if (!ExtremeRedoWorkerIsUndoSpaceWorker()) { XLogRecSetBlockDataState(record, UHEAP_MULTI_INSERT_ORIG_BLOCK_NUM, recordstatehead); } else { XLogRecSetUHeapUndoBlockState(record, UHEAP_UNDO_ORIG_BLOCK_NUM, recordstatehead); diff --git a/src/gausskernel/storage/buffer/bufmgr.cpp b/src/gausskernel/storage/buffer/bufmgr.cpp index 14c7a6011b7535dff642f3221cb5bdd2ba6b3a6c..8e79236af711c0ec7e26450202fa4179e59f8163 100644 --- a/src/gausskernel/storage/buffer/bufmgr.cpp +++ b/src/gausskernel/storage/buffer/bufmgr.cpp @@ -84,6 +84,7 @@ #include "tde_key_management/tde_key_storage.h" #include "ddes/dms/ss_dms_bufmgr.h" #include "ddes/dms/ss_common_attr.h" +#include "ddes/dms/ss_reform_common.h" #include "ddes/dms/ss_transaction.h" const int ONE_MILLISECOND = 1; @@ -2155,7 +2156,8 @@ Buffer ReadBuffer_common_for_dms(ReadBufferMode readmode, BufferDesc* buf_desc, Block bufBlock = BufHdrGetBlock(buf_desc); #ifdef USE_ASSERT_CHECKING - bool need_verify = (!RecoveryInProgress() && ((pg_atomic_read_u32(&buf_desc->state) & BM_VALID) != 0) && ENABLE_VERIFY_PAGE_VERSION); + bool need_verify = (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY && + ((pg_atomic_read_u32(&buf_desc->state) & BM_VALID) != 0) && ENABLE_VERIFY_PAGE_VERSION); char *past_image = NULL; if (need_verify) { past_image = 
(char *)palloc(BLCKSZ);
@@ -2346,8 +2348,11 @@ found_branch:
             }
             LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
         }
+
+        if (t_thrd.role != PAGEREDO && SS_ONDEMAND_BUILD_DONE && SS_PRIMARY_MODE) {
+            bufHdr = RedoForOndemandExtremeRTOQuery(bufHdr, relpersistence, forkNum, blockNum, mode);
+        }
     }
-
     return BufferDescriptorGetBuffer(bufHdr);
 }
 
@@ -2409,6 +2414,18 @@ found_branch:
 
     /* DMS: Try get page remote */
     if (ENABLE_DMS) {
+        // a standby node must notify the primary node to prepare the latest page in on-demand recovery
+        if (SS_STANDBY_ONDEMAND_RECOVERY) {
+            while (!SSOndemandRequestPrimaryRedo(bufHdr->tag)) {
+                SSReadControlFile(REFORM_CTRL_PAGE);
+                if (SS_STANDBY_ONDEMAND_NORMAL) {
+                    break; // on-demand recovery finished, skip
+                } else if (SS_STANDBY_ONDEMAND_BUILD) {
+                    return 0; // a new reform is in progress
+                }
+                // still need to request the page
+            }
+        }
         MarkReadHint(bufHdr->buf_id, relpersistence, isExtend, pblk);
         if (mode != RBM_FOR_REMOTE && relpersistence != RELPERSISTENCE_TEMP && !isLocalBuf) {
             Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));
diff --git a/src/gausskernel/storage/lmgr/lwlock.cpp b/src/gausskernel/storage/lmgr/lwlock.cpp
index 53e00458f1f507e6c162e056dde2368b4c279e67..d00e31e8410859e23f02982679c617200ec60eb9 100644
--- a/src/gausskernel/storage/lmgr/lwlock.cpp
+++ b/src/gausskernel/storage/lmgr/lwlock.cpp
@@ -196,7 +196,8 @@ static const char *BuiltinTrancheNames[] = {
     "FileRepairHashTblLock",
     "ReplicationOriginLock",
     "AuditIndextblLock",
-    "PCABufferContentLock"
+    "PCABufferContentLock",
+    "XlogTrackPartLock"
 };
 
 static void RegisterLWLockTranches(void);
@@ -436,6 +437,9 @@ int NumLWLocks(void)
     /* for barrier preparse hashtbl */
     numLocks += 1;
 
+    /* for xlog track hash table */
+    numLocks += NUM_XLOG_TRACK_PARTITIONS;
+
     /*
      * Add any requested by loadable modules; for backwards-compatibility
     * reasons, allocate at least NUM_USER_DEFINED_LWLOCKS of them even if
@@ -646,6 +650,10 @@ static void InitializeLWLocks(int numLocks)
         LWLockInitialize(&lock->lock, LWTRANCHE_STANDBY_STMTHIST);
     }
 
+    for (id = 0; id < NUM_XLOG_TRACK_PARTITIONS; id++, lock++) {
+        LWLockInitialize(&lock->lock, LWTRANCHE_XLOG_TRACK_PARTITION);
+    }
+
     Assert((lock - t_thrd.shemem_ptr_cxt.mainLWLockArray) == NumFixedLWLocks);
 
     for (id = NumFixedLWLocks; id < numLocks; id++, lock++) {
diff --git a/src/gausskernel/storage/lmgr/lwlocknames.txt b/src/gausskernel/storage/lmgr/lwlocknames.txt
index c6daaab09fd9976a3c3075d44d8bbefcea093600..3a3fe6b7551da9fe097c3f72e15f0bfbbd5f204b 100755
--- a/src/gausskernel/storage/lmgr/lwlocknames.txt
+++ b/src/gausskernel/storage/lmgr/lwlocknames.txt
@@ -138,3 +138,4 @@ GsStackLock 128
 ConfigFileLock 129
 DropArchiveSlotLock 130
 AboCacheLock 131
+OndemandXlogMemAllocLock 132
diff --git a/src/gausskernel/storage/lmgr/proc.cpp b/src/gausskernel/storage/lmgr/proc.cpp
index 8611e39ed5588d3196a15f3547cf37e86e662c8d..1aae1a251d0ae1d1366b4a2a0b99100ed7ec8485 100755
--- a/src/gausskernel/storage/lmgr/proc.cpp
+++ b/src/gausskernel/storage/lmgr/proc.cpp
@@ -2448,7 +2448,7 @@ void ProcSendSignal(ThreadId pid)
 {
     PGPROC* proc = NULL;
 
-    if (RecoveryInProgress()) {
+    if (RecoveryInProgress() || SS_IN_ONDEMAND_RECOVERY) {
         ProcBaseLockAccquire(&g_instance.proc_base_mutex_lock);
 
         /*
diff --git a/src/gausskernel/storage/smgr/segment/segbuffer.cpp b/src/gausskernel/storage/smgr/segment/segbuffer.cpp
index cfae8e036a2f43cdc8d78514533d218351d1e7f2..aeb17329190f8257485083b0fa5fef482dd463b9 100644
--- a/src/gausskernel/storage/smgr/segment/segbuffer.cpp
+++ b/src/gausskernel/storage/smgr/segment/segbuffer.cpp
@@ 
-344,7 +344,7 @@ void SegMarkBufferDirty(Buffer buf) #ifdef USE_ASSERT_CHECKING void SegFlushCheckDiskLSN(SegSpace *spc, RelFileNode rNode, ForkNumber forknum, BlockNumber blocknum, char *buf) { - if (!RecoveryInProgress() && ENABLE_DSS && ENABLE_VERIFY_PAGE_VERSION) { + if (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY && ENABLE_DSS && ENABLE_VERIFY_PAGE_VERSION) { char *origin_buf = (char *)palloc(BLCKSZ + ALIGNOF_BUFFER); char *temp_buf = (char *)BUFFERALIGN(origin_buf); seg_physical_read(spc, rNode, forknum, blocknum, temp_buf); @@ -526,8 +526,9 @@ Buffer ReadSegBufferForDMS(BufferDesc* bufHdr, ReadBufferMode mode, SegSpace *sp #endif } else { #ifdef USE_ASSERT_CHECKING - bool need_verify = (!RecoveryInProgress() && ((pg_atomic_read_u32(&bufHdr->state) & BM_VALID) != 0) && - ENABLE_DSS && ENABLE_VERIFY_PAGE_VERSION); + bool need_verify = (!RecoveryInProgress() && !SS_IN_ONDEMAND_RECOVERY && + ((pg_atomic_read_u32(&bufHdr->state) & BM_VALID) != 0) && ENABLE_DSS && + ENABLE_VERIFY_PAGE_VERSION); char *past_image = NULL; if (need_verify) { past_image = (char *)palloc(BLCKSZ); diff --git a/src/include/access/extreme_rto/batch_redo.h b/src/include/access/extreme_rto/batch_redo.h index 327fb3a746aec500caee5c52abb7632d927c903a..54d2a5be07a336eb2cba3f0137a84eb72f06c174 100644 --- a/src/include/access/extreme_rto/batch_redo.h +++ b/src/include/access/extreme_rto/batch_redo.h @@ -23,8 +23,8 @@ * --------------------------------------------------------------------------------------- */ -#ifndef BATCH_REDO_H -#define BATCH_REDO_H +#ifndef EXTREME_RTO_BATCH_REDO_H +#define EXTREME_RTO_BATCH_REDO_H #include "c.h" #include "storage/buf/block.h" @@ -69,4 +69,4 @@ extern void PRTrackClearBlock(XLogRecParseState *recordBlockState, HTAB *redoIt extern void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); } // namespace extreme_rto -#endif /* BATCH_REDO_H */ +#endif /* EXTREME_RTO_BATCH_REDO_H */ diff --git a/src/include/access/extreme_rto/dispatcher.h b/src/include/access/extreme_rto/dispatcher.h index 0f24ba42a4e55a64279b20a01a14b5a0a5851fe3..70b3a5b48904097990f72f1bfdea16d53606cba6 100644 --- a/src/include/access/extreme_rto/dispatcher.h +++ b/src/include/access/extreme_rto/dispatcher.h @@ -70,15 +70,6 @@ typedef enum { WORKER_STATE_EXITING, } ReadWorkersState; -typedef enum { - TRIGGER_NORMAL = 0, - TRIGGER_PRIMARY, - TRIGGER_STADNBY, - TRIGGER_FAILOVER, - TRIGGER_SWITCHOVER, - TRIGGER_SMARTSHUTDOWN, -} Enum_TriggeredState; - typedef enum { NONE, APPLYING, @@ -193,7 +184,6 @@ const static uint64 OUTPUT_WAIT_COUNT = 0x7FFFFFF; const static uint64 PRINT_ALL_WAIT_COUNT = 0x7FFFFFFFF; extern RedoItem g_redoEndMark; extern RedoItem g_terminateMark; -extern uint32 g_startupTriggerState; extern uint32 g_readManagerTriggerFlag; inline int get_batch_redo_num() @@ -251,13 +241,11 @@ void GetReplayedRecPtr(XLogRecPtr *startPtr, XLogRecPtr *endPtr); void StartupSendFowarder(RedoItem *item); XLogRecPtr GetSafeMinCheckPoint(); RedoWaitInfo redo_get_io_event(int32 event_id); -void redo_get_wroker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen); +void redo_get_worker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen); void CheckCommittingCsnList(); -void redo_get_wroker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); +void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); void DumpDispatcher(); } // namespace extreme_rto -extreme_rto::Enum_TriggeredState 
CheckForSatartupStatus(void); - #endif diff --git a/src/include/access/extreme_rto/xlog_read.h b/src/include/access/extreme_rto/xlog_read.h new file mode 100644 index 0000000000000000000000000000000000000000..98dc2271e7c3da9184efa24b00b2f787c973ea34 --- /dev/null +++ b/src/include/access/extreme_rto/xlog_read.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * xlog_read.h + * + * + * + * IDENTIFICATION + * src/include/access/extreme_rto/xlog_read.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef EXTREME_RTO_XLOG_READ_H +#define EXTREME_RTO_XLOG_READ_H + +#include "access/xlog_basic.h" + +namespace extreme_rto { +XLogRecord* XLogParallelReadNextRecord(XLogReaderState* xlogreader); +XLogRecord *ReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode); + +} // namespace extreme_rto +#endif /* EXTREME_RTO_XLOG_READ_H */ \ No newline at end of file diff --git a/src/include/access/extreme_rto_redo_api.h b/src/include/access/extreme_rto_redo_api.h new file mode 100644 index 0000000000000000000000000000000000000000..09f994d512151967c275df72b37b23f8b40b0e67 --- /dev/null +++ b/src/include/access/extreme_rto_redo_api.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * --------------------------------------------------------------------------------------- + * + * extreme_rto_redo_api.h + * + * + * IDENTIFICATION + * src/include/access/extreme_rto_redo_api.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef EXTREME_RTO_REDO_API_H +#define EXTREME_RTO_REDO_API_H + +#include "access/xlogproc.h" +#include "access/redo_statistic.h" +#include "access/ondemand_extreme_rto/redo_utils.h" + +typedef enum { + DEFAULT_EXTREME_RTO, + ONDEMAND_EXTREME_RTO, +} ExtremeRtoRedoType; + +extern ExtremeRtoRedoType g_extreme_rto_type; + +void ExtremeWaitAllReplayWorkerIdle(); +void ExtremeDispatchCleanInvalidPageMarkToAllRedoWorker(RepairFileKey key); +void ExtremeDispatchClosefdMarkToAllRedoWorker(); +void ExtremeRecordBadBlockAndPushToRemote(XLogBlockDataParse *datadecode, PageErrorType error_type, + XLogRecPtr old_lsn, XLogPhyBlock pblk); +void ExtremeCheckCommittingCsnList(); +XLogRecord *ExtremeReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode); +void ExtremeExtremeRtoStopHere(); +void ExtremeWaitAllRedoWorkerQueueEmpty(); +XLogRecPtr ExtremeGetSafeMinCheckPoint(); +void ExtremeClearRecoveryThreadHashTbl(const RelFileNode &node, ForkNumber forknum, BlockNumber minblkno, + bool segment_shrink); +void ExtremeBatchClearRecoveryThreadHashTbl(Oid spcNode, Oid dbNode); +bool ExtremeRedoWorkerIsUndoSpaceWorker(); +void ExtremeStartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen); +void ExtremeDispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +void ExtremeGetThreadNameIfPageRedoWorker(int argc, char *argv[], char **threadNamePtr); +PGPROC *ExtremeStartupPidGetProc(ThreadId pid); +void ExtremeUpdateStandbyState(HotStandbyState newState); +void ExtremeUpdateMinRecoveryForTrxnRedoThd(XLogRecPtr newMinRecoveryPoint); +uint32 ExtremeGetMyPageRedoWorkerIdWithLock(); +void ExtremeParallelRedoThreadMain(); +void ExtremeFreeAllocatedRedoItem(); +uint32 ExtremeGetAllWorkerCount(); +void **ExtremeGetXLogInvalidPagesFromWorkers(); +void ExtremeSendRecoveryEndMarkToWorkersAndWaitForFinish(int code); +RedoWaitInfo ExtremeRedoGetIoEvent(int32 event_id); +void ExtremeRedoGetWorkerStatistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen); +void ExtremeRedoGetWorkerTimeCount(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); +void ExtremeEndDispatcherContext(); +void ExtremeSetPageRedoWorkerIndex(int index); +int ExtremeGetPageRedoWorkerIndex(); +void ExtremeSetMyPageRedoWorker(knl_thread_arg *arg); +uint32 ExtremeGetMyPageRedoWorkerId(); +bool IsExtremeMultiThreadRedoRunning(); +bool IsExtremeRtoRunning(); +bool IsExtremeRtoSmartShutdown(); +void ExtremeRtoRedoManagerSendEndToStartup(); + +#endif \ No newline at end of file diff --git a/src/include/access/multi_redo_api.h b/src/include/access/multi_redo_api.h index 9f9da40fc17b566ca0fab431dff6b30afde604c7..1471c37e63c88a1147f2456abcd0b8bd12037411 100644 --- a/src/include/access/multi_redo_api.h +++ b/src/include/access/multi_redo_api.h @@ -33,9 +33,14 @@ #include "nodes/pg_list.h" #include "storage/proc.h" #include "access/redo_statistic.h" +#include "access/extreme_rto_redo_api.h" - - +#ifdef ENABLE_LITE_MODE +#define ENABLE_ONDEMAND_RECOVERY false +#else +#define ENABLE_ONDEMAND_RECOVERY (ENABLE_DMS && IsExtremeRedo() \ + && g_instance.attr.attr_storage.dms_attr.enable_ondemand_recovery) +#endif typedef enum { NOT_PAGE_REDO_THREAD, @@ -44,6 +49,7 @@ typedef enum { } 
PageRedoExitStatus; extern bool g_supportHotStandby; +extern uint32 g_startupTriggerState; const static bool SUPPORT_FPAGE_DISPATCH = true; /* support file dispatch if true, else support page dispatch */ const static bool SUPPORT_USTORE_UNDO_WORKER = true; /* if true, USTORE has a dedicated undo redo worker and supports page dispatch */ @@ -88,7 +94,6 @@ static inline bool IsMultiThreadRedo() uint32 GetRedoWorkerCount(); bool IsMultiThreadRedoRunning(); -bool IsExtremeRtoRunning(); void DispatchRedoRecord(XLogReaderState* record, List* expectedTLIs, TimestampTz recordXTime); void GetThreadNameIfMultiRedo(int argc, char* argv[], char** threadNamePtr); @@ -113,9 +118,7 @@ void FreeAllocatedRedoItem(); void** GetXLogInvalidPagesFromWorkers(); void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code); RedoWaitInfo GetRedoIoEvent(int32 event_id); -void GetRedoWrokerStatistic(uint32* realNum, RedoWorkerStatsData* worker, uint32 workerLen); -bool IsExtremeRtoSmartShutdown(); -void ExtremeRtoRedoManagerSendEndToStartup(); +void GetRedoWorkerStatistic(uint32* realNum, RedoWorkerStatsData* worker, uint32 workerLen); void CountXLogNumbers(XLogReaderState *record); void ApplyRedoRecord(XLogReaderState* record); void DiagLogRedoRecord(XLogReaderState *record, const char *funcName); diff --git a/src/include/access/ondemand_extreme_rto/batch_redo.h b/src/include/access/ondemand_extreme_rto/batch_redo.h new file mode 100644 index 0000000000000000000000000000000000000000..5abde575471415fb5341d11481d87e778f0fab44 --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/batch_redo.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * batch_redo.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/batch_redo.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_BATCH_REDO_H +#define ONDEMAND_EXTREME_RTO_BATCH_REDO_H + +#include "c.h" +#include "storage/buf/block.h" +#include "storage/smgr/relfilenode.h" +#include "lib/dllist.h" +#include "utils/hsearch.h" +#include "access/xlogproc.h" +#include "access/xlogutils.h" + +namespace ondemand_extreme_rto { + +#define PAGETYPE_DROP 0x04 +#define PAGETYPE_CREATE 0x02 +#define PAGETYPE_TRUNCATE 0x01 +#define PAGETYPE_MODIFY 0x00 + +#define INITredoItemHashSIZE 1024 + +#define INIT_REDO_ITEM_TAG(a, xx_rnode, xx_forkNum, xx_blockNum) \ + ((a).rNode = (xx_rnode), (a).forkNum = (xx_forkNum), (a).blockNum = (xx_blockNum)) + +#define XlogTrackTableHashPartition(hashcode) ((hashcode) % NUM_XLOG_TRACK_PARTITIONS) +#define XlogTrackMappingPartitionLock(hashcode) \ + (&t_thrd.shemem_ptr_cxt.mainLWLockArray[FirstXlogTrackLock + XlogTrackTableHashPartition(hashcode)].lock) + +/* + * Note: if there are any pad bytes in the struct, INIT_REDO_ITEM_TAG has + * to be fixed to zero them, since this struct is used as a hash key.
+ */ +typedef struct redoitemtag { + RelFileNode rNode; + ForkNumber forkNum; + BlockNumber blockNum; +} RedoItemTag; + +typedef struct redoitemhashentry { + RedoItemTag redoItemTag; + XLogRecParseState *head; + XLogRecParseState *tail; + int redoItemNum; + bool redoDone; +} RedoItemHashEntry; + +extern void PRPrintRedoItemHashTab(HTAB *redoItemHash); +extern HTAB **PRRedoItemHashInitialize(MemoryContext context); +extern void PRTrackClearBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); +extern void PRTrackAddBlock(XLogRecParseState *recordBlockState, HTAB *redoItemHash); +extern uint32 XlogTrackTableHashCode(RedoItemTag *tagPtr); + +} // namespace ondemand_extreme_rto +#endif /* ONDEMAND_EXTREME_RTO_BATCH_REDO_H */ diff --git a/src/include/access/ondemand_extreme_rto/dispatcher.h b/src/include/access/ondemand_extreme_rto/dispatcher.h new file mode 100644 index 0000000000000000000000000000000000000000..17f9958cfac2b1e18351cdfec002e5169d01e83d --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/dispatcher.h @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * dispatcher.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/dispatcher.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_DISPATCHER_H +#define ONDEMAND_EXTREME_RTO_DISPATCHER_H + +#include "gs_thread.h" +#include "postgres.h" +#include "knl/knl_variable.h" +#include "access/xlog.h" +#include "access/xlogreader.h" +#include "nodes/pg_list.h" +#include "storage/proc.h" +#include "access/redo_statistic.h" +#include "access/ondemand_extreme_rto/redo_item.h" +#include "access/ondemand_extreme_rto/page_redo.h" +#include "access/ondemand_extreme_rto/txn_redo.h" + +namespace ondemand_extreme_rto { + +typedef struct { + PageRedoWorker *batchThd; /* BatchRedoThread */ + PageRedoWorker *managerThd; /* PageRedoManager */ + PageRedoWorker **redoThd; /* RedoThreadPool */ + uint32 redoThdNum; + uint32 *chosedRTIds; /* chosedRedoThdIds */ + uint32 chosedRTCnt; /* chosedRedoThdCount */ +} PageRedoPipeline; + +typedef struct { + PageRedoWorker *managerThd; /* TrxnRedoManager */ + PageRedoWorker *redoThd; /* TrxnRedoWorker */ +} TrxnRedoPipeline; + +typedef struct ReadPipeline { + PageRedoWorker *managerThd; /* readthrd */ + PageRedoWorker *readPageThd; /* readthrd */ + PageRedoWorker *readThd; /* readthrd */ +} ReadPipeline; + +#define MAX_XLOG_READ_BUFFER (0xFFFFF) /* 8k uint */ + +typedef enum { + WORKER_STATE_STOP = 0, + WORKER_STATE_RUN, + WORKER_STATE_STOPPING, + WORKER_STATE_EXIT, + WORKER_STATE_EXITING, +} ReadWorkersState; + +typedef enum { + NONE, + APPLYING, + APPLIED, +} ReadBufState; + +typedef enum { + READ_MANAGER_STOP, + READ_MANAGER_RUN, +} XLogReadManagerState; + +typedef struct RecordBufferAarray { + XLogSegNo segno; + XLogRecPtr segoffset; + uint32 readlen; + char *readsegbuf; + uint32 bufState; +} 
RecordBufferAarray; + +#ifdef USE_ASSERT_CHECKING +#define LSN_CHECK_BUF_SIZE (128*1024*1024) +typedef struct { + uint64 curPosition; + XLogRecPtr curLsn; +#if (!defined __x86_64__) && (!defined __aarch64__) + /* protects lastReplayedReadRecPtr and lastReplayedEndRecPtr */ + slock_t ptrLck; +#endif + uint32 lsnCheckBuf[LSN_CHECK_BUF_SIZE]; +}LsnCheckCtl; + +#endif + +typedef struct RecordBufferState { + XLogReaderState *initreader; + uint32 readWorkerState; + uint32 readPageWorkerState; + uint32 readSource; + uint32 failSource; + uint32 xlogReadManagerState; + uint32 applyindex; + uint32 readindex; + RecordBufferAarray xlogsegarray[MAX_ALLOC_SEGNUM]; + char *readsegbuf; + char *readBuf; + char *errormsg_buf; + void *readprivate; + XLogRecPtr targetRecPtr; + XLogRecPtr expectLsn; + uint32 waitRedoDone; +} RecordBufferState; + +typedef struct { + MemoryContext oldCtx; + PageRedoPipeline *pageLines; + uint32 pageLineNum; /* PageLineNum */ + uint32 *chosedPageLineIds; /* chosedPageLineIds */ + uint32 chosedPLCnt; /* chosedPageLineCount */ + TrxnRedoPipeline trxnLine; + ReadPipeline readLine; + RecordBufferState rtoXlogBufState; + PageRedoWorker **allWorkers; /* Array of page redo workers. */ + uint32 allWorkersCnt; + RedoItem *freeHead; /* Head of freed-item list. */ + RedoItem *freeStateHead; + RedoItem *allocatedRedoItem; + int exitCode; /* Thread exit code. */ + uint64 totalCostTime; + uint64 txnCostTime; /* txn cost time */ + uint64 pprCostTime; + uint32 maxItemNum; + uint32 curItemNum; + + uint32 syncEnterCount; + uint32 syncExitCount; + + volatile uint32 batchThrdEnterNum; + volatile uint32 batchThrdExitNum; + + volatile uint32 segpageXactDoneFlag; + + pg_atomic_uint32 standbyState; /* sync standbyState from trxn worker to startup */ + + bool needImmediateCheckpoint; + bool needFullSyncCheckpoint; + volatile sig_atomic_t smartShutdown; +#ifdef USE_ASSERT_CHECKING + void *originLsnCheckAddr; + LsnCheckCtl *lsnCheckCtl; + slock_t updateLck; +#endif + RedoInterruptCallBackFunc oldStartupIntrruptFunc; + volatile bool recoveryStop; + volatile XLogRedoNumStatics xlogStatics[RM_NEXT_ID][MAX_XLOG_INFO_NUM]; + RedoTimeCost *startupTimeCost; + RedoParseManager parseManager; +} LogDispatcher; + +typedef struct { + bool (*rm_dispatch)(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); + bool (*rm_loginfovalid)(XLogReaderState *record, uint8 minInfo, uint8 maxInfo); + RmgrId rm_id; + uint8 rm_mininfo; + uint8 rm_maxinfo; +} RmgrDispatchData; + +extern LogDispatcher *g_dispatcher; +extern RedoItem g_GlobalLsnForwarder; +extern RedoItem g_cleanupMark; +extern THR_LOCAL RecordBufferState *g_recordbuffer; + +const static uint64 OUTPUT_WAIT_COUNT = 0x7FFFFFF; +const static uint64 PRINT_ALL_WAIT_COUNT = 0x7FFFFFFFF; +extern RedoItem g_redoEndMark; +extern RedoItem g_terminateMark; +extern uint32 g_readManagerTriggerFlag; +extern RefOperate recordRefOperate; + +inline int get_batch_redo_num() +{ + return g_instance.attr.attr_storage.batch_redo_num; +} + +inline int get_page_redo_worker_num_per_manager() +{ + return g_instance.attr.attr_storage.recovery_redo_workers_per_paser_worker; +} + +inline int get_trxn_redo_manager_num() +{ + return TRXN_REDO_MANAGER_NUM; +} + +inline int get_trxn_redo_worker_num() +{ + return TRXN_REDO_WORKER_NUM; +} + +void StartRecoveryWorkers(XLogReaderState *xlogreader, uint32 privateLen); + +/* RedoItem lifecycle. 
*/ +void DispatchRedoRecordToFile(XLogReaderState *record, List *expectedTLIs, TimestampTz recordXTime); +void ProcessPendingRecords(bool fullSync = false); +void FreeRedoItem(RedoItem *item); + +/* Dispatcher phases. */ +void SendRecoveryEndMarkToWorkersAndWaitForFinish(int code); +void SendRecoveryEndMarkToWorkersAndWaitForReach(int code); +void WaitRedoFinish(); + +/* Dispatcher states. */ +int GetDispatcherExitCode(); +bool DispatchPtrIsNull(); +uint32 GetBatchCount(); +uint32 GetAllWorkerCount(); +PGPROC *StartupPidGetProc(ThreadId pid); +extern void SetStartupBufferPinWaitBufId(int bufid); +extern void GetStartupBufferPinWaitBufId(int *bufids, uint32 len); +void UpdateStandbyState(HotStandbyState newState); +void UpdateMinRecoveryForTrxnRedoThd(XLogRecPtr minRecoveryPoint); + +/* Redo end state saved by each page worker. */ +void **GetXLogInvalidPagesFromWorkers(); + +/* Other utility functions. */ +uint32 GetSlotId(const RelFileNode node, BlockNumber block, ForkNumber forkNum, uint32 workerCount); +bool XactHasSegpageRelFiles(XLogReaderState *record); +XLogReaderState *NewReaderState(XLogReaderState *readerState); +void FreeAllocatedRedoItem(); +List *CheckImcompleteAction(List *imcompleteActionList); +void SetPageWorkStateByThreadId(uint32 threadState); +void GetReplayedRecPtr(XLogRecPtr *startPtr, XLogRecPtr *endPtr); +void StartupSendFowarder(RedoItem *item); +XLogRecPtr GetSafeMinCheckPoint(); +RedoWaitInfo redo_get_io_event(int32 event_id); +void redo_get_worker_statistic(uint32 *realNum, RedoWorkerStatsData *worker, uint32 workerLen); +void CheckCommittingCsnList(); +void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); +void DumpDispatcher(); + +} // namespace ondemand_extreme_rto + +#endif diff --git a/src/include/access/ondemand_extreme_rto/page_redo.h b/src/include/access/ondemand_extreme_rto/page_redo.h new file mode 100644 index 0000000000000000000000000000000000000000..9d55e598c2cf3679e4797b16d909aa5d5a3bd842 --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/page_redo.h @@ -0,0 +1,249 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * --------------------------------------------------------------------------------------- + * + * page_redo.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/page_redo.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_PAGE_REDO_H +#define ONDEMAND_EXTREME_RTO_PAGE_REDO_H + +#include "postgres.h" +#include "knl/knl_variable.h" + +#include "access/ondemand_extreme_rto/redo_item.h" +#include "nodes/pg_list.h" +#include "storage/proc.h" + +#include "access/ondemand_extreme_rto/posix_semaphore.h" +#include "access/ondemand_extreme_rto/spsc_blocking_queue.h" +#include "access/xlogproc.h" +#include "postmaster/pagerepair.h" + +namespace ondemand_extreme_rto { + +#define ONDEMAND_DISTRIBUTE_RATIO 0.9 + +static const uint32 PAGE_WORK_QUEUE_SIZE = 2097152; + +static const uint32 ONDEMAND_EXTREME_RTO_ALIGN_LEN = 16; /* need 128-bit aligned */ +static const uint32 MAX_REMOTE_READ_INFO_NUM = 100; +static const uint32 ADVANCE_GLOBALLSN_INTERVAL = 1; /* unit second */ + +typedef enum { + REDO_BATCH, + REDO_PAGE_MNG, + REDO_PAGE_WORKER, + REDO_TRXN_MNG, + REDO_TRXN_WORKER, + REDO_READ_WORKER, + REDO_READ_PAGE_WORKER, + REDO_READ_MNG, + REDO_ROLE_NUM, +} RedoRole; + +typedef struct BadBlockRecEnt{ + RepairBlockKey key; + XLogPhyBlock pblk; + XLogRecPtr rec_min_lsn; + XLogRecPtr rec_max_lsn; + XLogRecParseState *head; + XLogRecParseState *tail; +} BadBlockRecEnt; + +struct PageRedoWorker { + /* + * The last successfully applied log record's end position + 1 as an + * atomic uint64. The type of a log record's position is XLogRecPtr. + * Here the position is stored as an uint64 so it can be read and + * written atomically. + */ + XLogRecPtr lastReplayedReadRecPtr; + XLogRecPtr lastReplayedEndRecPtr; +#if (!defined __x86_64__) && (!defined __aarch64__) + /* protects lastReplayedReadRecPtr and lastReplayedEndRecPtr */ + slock_t ptrLck; +#endif + PageRedoWorker *selfOrinAddr; + /* Worker id. */ + uint32 id; + int index; + /* Thread id */ + gs_thread_t tid; + /* The proc struct of this worker thread. */ + PGPROC *proc; + RedoRole role; + uint32 slotId; + bool isUndoSpaceWorker; + /* --------------------------------------------- + * Initial context + * + * Global variable values at worker creation time. + */ + + /* Initial server mode from the dispatcher. */ + ServerMode initialServerMode; + /* Initial timeline ID from the dispatcher. */ + TimeLineID initialTimeLineID; + List *expectedTLIs; + /* --------------------------------------------- + * Redo item queue. + * + * Redo items are provided by the dispatcher and consumed by each + * worker. See AddPageRedoItem() for the use of the additional + * pending list. + */ + + /* The head of the pending item list. */ + RedoItem *pendingHead; + /* The tail of the pending item list. */ + RedoItem *pendingTail; + /* To-be-replayed log-record-list queue. */ + SPSCBlockingQueue *queue; + + /* + * The last recovery restart point seen by the txn worker. Restart + * points before this is useless and can be removed. + */ + XLogRecPtr lastCheckedRestartPoint; + /* min recovery point */ + XLogRecPtr minRecoveryPoint; + /* --------------------------------------------- + * Per-worker run-time context + * + * States maintained by each individual page-redo worker during + * log replay. These are read by the txn-redo worker. 
+ */ + + /* --------------------------------------------- + * Global run-time context + * + * States maintained outside page-redo worker during log replay. + * Updates to these states must be synchronized to all page-redo workers. + */ + + /* + * Global standbyState set by the txn worker. + */ + HotStandbyState standbyState; + TransactionId latestObservedXid; + bool StandbyMode; + char *DataDir; + + TransactionId RecentXmin; + /* --------------------------------------------- + * Redo end context + * + * Thread-local variable values saved after log replay has completed. + * These values are collected by each redo worker at redo end and + * are used by the dispatcher. + */ + /* XLog invalid pages. */ + void *xlogInvalidPages; + + void *committingCsnList; + + /* --------------------------------------------- + * Phase barrier. + * + * A barrier for synchronizing the dispatcher and page redo worker + * between different phases. + */ + + /* Semaphore marking the completion of the current phase. */ + PosixSemaphore phaseMarker; + MemoryContext oldCtx; + + HTAB *redoItemHash; + TimeLineID recoveryTargetTLI; + bool ArchiveRecoveryRequested; + bool StandbyModeRequested; + bool InArchiveRecovery; + bool ArchiveRestoreRequested; + bool InRecovery; + char* recoveryRestoreCommand; + uint32 fullSyncFlag; + RedoParseManager parseManager; + RedoBufferManager bufferManager; + RedoTimeCost timeCostList[TIME_COST_NUM]; + char page[BLCKSZ]; + XLogBlockDataParse *curRedoBlockState; +}; + + +extern THR_LOCAL PageRedoWorker *g_redoWorker; + +/* Worker lifecycle. */ +PageRedoWorker *StartPageRedoWorker(PageRedoWorker *worker); +void DestroyPageRedoWorker(PageRedoWorker *worker); + +/* Thread creation utility functions. */ +bool IsPageRedoWorkerProcess(int argc, char *argv[]); +void AdaptArgvForPageRedoWorker(char *argv[]); +void GetThreadNameIfPageRedoWorker(int argc, char *argv[], char **threadNamePtr); + +extern bool RedoWorkerIsUndoSpaceWorker(); +uint32 GetMyPageRedoWorkerIdWithLock(); +PGPROC *GetPageRedoWorkerProc(PageRedoWorker *worker); + +/* Worker main function. */ +void ParallelRedoThreadRegister(); +void ParallelRedoThreadMain(); + +/* Dispatcher phases. */ +bool SendPageRedoEndMark(PageRedoWorker *worker); +void WaitPageRedoWorkerReachLastMark(PageRedoWorker *worker); + +/* Redo processing. */ +void AddPageRedoItem(PageRedoWorker *worker, void *item); + +uint64 GetCompletedRecPtr(PageRedoWorker *worker); +void UpdatePageRedoWorkerStandbyState(PageRedoWorker *worker, HotStandbyState newState); + +/* Redo end states. 
*/ +void ClearBTreeIncompleteActions(PageRedoWorker *worker); +void *GetXLogInvalidPages(PageRedoWorker *worker); +bool RedoWorkerIsIdle(PageRedoWorker *worker); +void DumpPageRedoWorker(PageRedoWorker *worker); +PageRedoWorker *CreateWorker(uint32 id); +extern void UpdateRecordGlobals(RedoItem *item, HotStandbyState standbyState); +void ReferenceRedoItem(void *item); +void DereferenceRedoItem(void *item); +void ReferenceRecParseState(XLogRecParseState *recordstate); +void DereferenceRecParseState(XLogRecParseState *recordstate); +void PushToWorkerLsn(); +void GetCompletedReadEndPtr(PageRedoWorker *worker, XLogRecPtr *readPtr, XLogRecPtr *endPtr); +void SetReadBufferForExtRto(XLogReaderState *state, XLogRecPtr pageptr, int reqLen); +void DumpExtremeRtoReadBuf(); +void PutRecordToReadQueue(XLogReaderState *recordreader); +bool LsnUpdate(); +void ResetRtoXlogReadBuf(XLogRecPtr targetPagePtr); +bool XLogPageReadForExtRto(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen); +void ExtremeRtoStopHere(); +void WaitAllRedoWorkerQueueEmpty(); +void WaitAllReplayWorkerIdle(); +void DispatchClosefdMarkToAllRedoWorker(); +void DispatchCleanInvalidPageMarkToAllRedoWorker(RepairFileKey key); + +const char *RedoWokerRole2Str(RedoRole role); + +} // namespace ondemand_extreme_rto +#endif diff --git a/src/include/access/ondemand_extreme_rto/posix_semaphore.h b/src/include/access/ondemand_extreme_rto/posix_semaphore.h new file mode 100644 index 0000000000000000000000000000000000000000..43cb682a4fc17f0555f4adc77088bb5b5e13c1d8 --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/posix_semaphore.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * posix_semaphore.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/posix_semaphore.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_POSIX_SEMAPHORE_H +#define ONDEMAND_EXTREME_RTO_POSIX_SEMAPHORE_H + +#include <semaphore.h> +namespace ondemand_extreme_rto { + +typedef struct { + sem_t semaphore; + bool initialized; +} PosixSemaphore; + +void PosixSemaphoreInit(PosixSemaphore *sem, unsigned int initValue); +void PosixSemaphoreDestroy(PosixSemaphore *sem); +void PosixSemaphoreWait(PosixSemaphore *sem); +void PosixSemaphorePost(PosixSemaphore *sem); +} // namespace ondemand_extreme_rto +#endif diff --git a/src/include/access/ondemand_extreme_rto/redo_item.h b/src/include/access/ondemand_extreme_rto/redo_item.h new file mode 100644 index 0000000000000000000000000000000000000000..069d5d101482ac16667923631db26964a5eee48c --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/redo_item.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2.
+ * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * redo_item.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/redo_item.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_REDO_ITEM_H +#define ONDEMAND_EXTREME_RTO_REDO_ITEM_H + +#include "access/xlogreader.h" +#include "datatype/timestamp.h" +#include "nodes/pg_list.h" +#include "utils/atomic.h" +#include "storage/buf/block.h" +#include "storage/smgr/relfilenode.h" + +#include "access/ondemand_extreme_rto/posix_semaphore.h" +#include "replication/replicainternal.h" + +namespace ondemand_extreme_rto { + +typedef struct RedoItem_s { + bool needImmediateCheckpoint; + bool needFullSyncCheckpoint; + /* The expected timelines for this record. */ + List *expectedTLIs; + /* The timestamp of the log record if it is a transaction record. */ + TimestampTz recordXTime; + /* Next item on the free list. */ + struct RedoItem_s *freeNext; + /* A "deep" copy of the log record. */ + XLogReaderState record; + /* Used for really free */ + struct RedoItem_s *allocatedNext; + TimestampTz syncXLogReceiptTime; + int syncXLogReceiptSource; + TransactionId RecentXmin; + ServerMode syncServerMode; +} RedoItem; + +static const int32 ANY_BLOCK_ID = -1; +static const uint32 ANY_WORKER = (uint32)-1; +static const uint32 TRXN_WORKER = (uint32)-2; +static const uint32 ALL_WORKER = (uint32)-3; +static const uint32 USTORE_WORKER = (uint32)-4; + +void DumpItem(RedoItem *item, const char *funcName); + +static inline RedoItem *GetRedoItemPtr(XLogReaderState *record) +{ + return (RedoItem *)(((char *)record) - offsetof(RedoItem, record)); +} + +} // namespace ondemand_extreme_rto + +#endif diff --git a/src/include/access/ondemand_extreme_rto/redo_utils.h b/src/include/access/ondemand_extreme_rto/redo_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..8b2775785e1ca58da523f2d2d852090b03a79cc7 --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/redo_utils.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * --------------------------------------------------------------------------------------- + * + * redo_utils.h + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/redo_utils.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_REDO_UTILS_H +#define ONDEMAND_EXTREME_RTO_REDO_UTILS_H + +#include "access/xlogproc.h" + +#define PARSEBUFFER_SIZE (sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)) +#define ONDEMAND_MAX_PARSEBUFF_PREPALLOC ((1024 * 1024 * 1024 - 1) / PARSEBUFFER_SIZE) +#define ONDEMAND_MAX_PARSESIZE_PREPALLOC (ONDEMAND_MAX_PARSEBUFF_PREPALLOC * PARSEBUFFER_SIZE) +#define ONDEMAND_MAX_PARSEBUFF_ALLOCSIZE 100 // 100GB + +typedef struct +{ + int allocNum; + void *allocEntry[ONDEMAND_MAX_PARSEBUFF_ALLOCSIZE]; + void *memslotEntry; +} OndemandParseAllocCtrl; + + +void OndemandXLogParseBufferInit(RedoParseManager *parsemanager, int buffernum, RefOperate *refOperate, + InterruptFunc interruptOperte); +void OndemandXLogParseBufferDestory(RedoParseManager *parsemanager); +XLogRecParseState *OndemandXLogParseBufferAllocList(RedoParseManager *parsemanager, XLogRecParseState *blkstatehead, + void *record); +void OndemandXLogParseBufferRelease(XLogRecParseState *recordstate); +void OnDemandSendRecoveryEndMarkToWorkersAndWaitForReach(int code); +void OnDemandWaitRedoFinish(); + +#endif /* ONDEMAND_EXTREME_RTO_REDO_UTILS_H */ \ No newline at end of file diff --git a/src/include/access/ondemand_extreme_rto/spsc_blocking_queue.h b/src/include/access/ondemand_extreme_rto/spsc_blocking_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..e0bb268726074777f1a4a0416d50c0b19604f69c --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/spsc_blocking_queue.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * spsc_blocking_queue.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/spsc_blocking_queue.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_SPSC_BLOCKING_QUEUE_H +#define ONDEMAND_EXTREME_RTO_SPSC_BLOCKING_QUEUE_H + +#include "postgres.h" +#include "knl/knl_variable.h" +#include "access/parallel_recovery/posix_semaphore.h" + + +namespace ondemand_extreme_rto { +typedef void (*CallBackFunc)(); + +struct SPSCBlockingQueue { + pg_atomic_uint32 writeHead; /* Array index for the next write. */ + pg_atomic_uint32 readTail; /* Array index for the next read. */ + uint32 capacity; /* Queue capacity, must be power of 2. */ + uint32 mask; /* Bit mask for computing index. */ + pg_atomic_uint32 maxUsage; + pg_atomic_uint64 totalCnt; + CallBackFunc callBackFunc; + uint64 lastTotalCnt; + void *buffer[1]; /* Queue buffer, the actual size is capacity. 
*/ +}; + +SPSCBlockingQueue *SPSCBlockingQueueCreate(uint32 capacity, CallBackFunc func = NULL); +void SPSCBlockingQueueDestroy(SPSCBlockingQueue *queue); + +bool SPSCBlockingQueuePut(SPSCBlockingQueue *queue, void *element); +void *SPSCBlockingQueueTake(SPSCBlockingQueue *queue); +bool SPSCBlockingQueueIsEmpty(SPSCBlockingQueue *queue); +void *SPSCBlockingQueueTop(SPSCBlockingQueue *queue); +void SPSCBlockingQueuePop(SPSCBlockingQueue *queue); +void DumpQueue(const SPSCBlockingQueue *queue); +uint32 SPSCGetQueueCount(SPSCBlockingQueue *queue); +bool SPSCBlockingQueueGetAll(SPSCBlockingQueue *queue, void ***eleArry, uint32 *eleNum); +void SPSCBlockingQueuePopN(SPSCBlockingQueue *queue, uint32 n); +} // namespace ondemand_extreme_rto +#endif diff --git a/src/include/access/ondemand_extreme_rto/txn_redo.h b/src/include/access/ondemand_extreme_rto/txn_redo.h new file mode 100644 index 0000000000000000000000000000000000000000..5bd1a8b9c1ec7f5cba54df38a3211726c262a2ae --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/txn_redo.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2020 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. + * --------------------------------------------------------------------------------------- + * + * txn_redo.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/txn_redo.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_TXN_REDO_H +#define ONDEMAND_EXTREME_RTO_TXN_REDO_H + +#include "access/parallel_recovery/redo_item.h" + +namespace ondemand_extreme_rto { +void AddTxnRedoItem(PageRedoWorker *worker, void *item); +void TrxnMngProc(RedoItem *item, PageRedoWorker *wk); +void TrxnWorkerProc(RedoItem *item); +} // namespace ondemand_extreme_rto +#endif diff --git a/src/include/access/ondemand_extreme_rto/xlog_read.h b/src/include/access/ondemand_extreme_rto/xlog_read.h new file mode 100644 index 0000000000000000000000000000000000000000..6642a013e8d9770b075df028e308bef9f900afe7 --- /dev/null +++ b/src/include/access/ondemand_extreme_rto/xlog_read.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2023 Huawei Technologies Co.,Ltd. + * + * openGauss is licensed under Mulan PSL v2. + * You can use this software according to the terms and conditions of the Mulan PSL v2. + * You may obtain a copy of Mulan PSL v2 at: + * + * http://license.coscl.org.cn/MulanPSL2 + * + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, + * EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, + * MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. + * See the Mulan PSL v2 for more details. 
+ * --------------------------------------------------------------------------------------- + * + * xlog_read.h + * + * + * + * IDENTIFICATION + * src/include/access/ondemand_extreme_rto/xlog_read.h + * + * --------------------------------------------------------------------------------------- + */ + +#ifndef ONDEMAND_EXTREME_RTO_XLOG_READ_H +#define ONDEMAND_EXTREME_RTO_XLOG_READ_H + +#include "access/xlog_basic.h" + +namespace ondemand_extreme_rto { +XLogRecord* XLogParallelReadNextRecord(XLogReaderState* xlogreader); +XLogRecord *ReadNextXLogRecord(XLogReaderState **xlogreaderptr, int emode); + +} // namespace ondemand_extreme_rto +#endif /* ONDEMAND_EXTREME_RTO_XLOG_READ_H */ \ No newline at end of file diff --git a/src/include/access/parallel_recovery/dispatcher.h b/src/include/access/parallel_recovery/dispatcher.h index 45474bcf69cf8f2302414e3505341ea12b3ee10e..cad5d9481aef597045da4e017da3957049ec7d8c 100644 --- a/src/include/access/parallel_recovery/dispatcher.h +++ b/src/include/access/parallel_recovery/dispatcher.h @@ -126,7 +126,7 @@ void GetReplayedRecPtrFromUndoWorkers(XLogRecPtr *readPtr, XLogRecPtr *endPtr); List* CheckImcompleteAction(List* imcompleteActionList); void SetPageWorkStateByThreadId(uint32 threadState); RedoWaitInfo redo_get_io_event(int32 event_id); -void redo_get_wroker_statistic(uint32* realNum, RedoWorkerStatsData* worker, uint32 workerLen); +void redo_get_worker_statistic(uint32* realNum, RedoWorkerStatsData* worker, uint32 workerLen); extern void redo_dump_all_stats(); void WaitRedoWorkerIdle(); void SendClearMarkToAllWorkers(); @@ -139,7 +139,7 @@ extern void InitReaderStateByOld(XLogReaderState *newState, XLogReaderState *old extern void CopyDataFromOldReader(XLogReaderState *newReaderState, XLogReaderState *oldReaderState); bool TxnQueueIsEmpty(TxnRedoWorker* worker); -void redo_get_wroker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); +void redo_get_worker_time_count(RedoWorkerTimeCountsInfo **workerCountInfoList, uint32 *realNum); } diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 3b991f20d1b6819abdb45a8931c629349df198b1..314357fc299f9129da85d0753b05ee0003ddc945 100755 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -64,6 +64,15 @@ typedef enum { STANDBY_SNAPSHOT_READY } HotStandbyState; +typedef enum { + TRIGGER_NORMAL = 0, + TRIGGER_PRIMARY, + TRIGGER_STADNBY, + TRIGGER_FAILOVER, + TRIGGER_SWITCHOVER, + TRIGGER_SMARTSHUTDOWN, +} Enum_TriggeredState; + #define InHotStandby (t_thrd.xlog_cxt.standbyState >= STANDBY_SNAPSHOT_PENDING) #define DUMMYSTANDBY_CONNECT_INTERVAL 3 // unit second @@ -532,6 +541,8 @@ typedef struct XLogCtlData { bool SharedRecoveryInProgress; bool IsRecoveryDone; + bool IsOnDemandBuildDone; + bool IsOnDemandRecoveryDone; /* * SharedHotStandbyActive indicates if we're still in crash or archive @@ -810,7 +821,6 @@ extern char* TrimStr(const char* str); extern void CloseXlogFilesAtThreadExit(void); extern void SetLatestXTime(TimestampTz xtime); -XLogRecord* XLogParallelReadNextRecord(XLogReaderState* xlogreader); void ResourceManagerStartup(void); void ResourceManagerStop(void); @@ -855,6 +865,16 @@ bool CheckForSwitchoverTrigger(void); void HandleCascadeStandbyPromote(XLogRecPtr *recptr); void update_dirty_page_queue_rec_lsn(XLogRecPtr current_insert_lsn, bool need_immediately_update = false); XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int whichChkpt); +int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +bool 
timeLineInHistory(TimeLineID tli, List *expectedTLEs); +Enum_TriggeredState CheckForSatartupStatus(void); +bool CheckForStandbyTrigger(void); +void UpdateMinrecoveryInAchive(); +bool NewDataIsInBuf(XLogRecPtr expectedRecPtr); +bool rescanLatestTimeLine(void); +int XLogFileReadAnyTLI(XLogSegNo segno, int emode, uint32 sources); +int SSXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen, + XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI, char* xlog_path); extern XLogRecPtr XlogRemoveSegPrimary; diff --git a/src/include/access/xlog_basic.h b/src/include/access/xlog_basic.h index 31b526292d967dae9e62d0efd185f0a0cea262de..e03b585bdc346cd7508b585112b2a1243c968e25 100644 --- a/src/include/access/xlog_basic.h +++ b/src/include/access/xlog_basic.h @@ -99,6 +99,7 @@ * The XLog directory and control file (relative to $PGDATA) */ #define SS_XLOGDIR (g_instance.datadir_cxt.xlogDir) +#define SS_XLOGRECOVERYDIR (g_instance.dms_cxt.SSRecoveryInfo.recovery_xlogDir) #define XLOGDIR "pg_xlog" #define ARCHIVEDIR "pg_xlog/archive_status" #define XLOG_CONTROL_FILE (g_instance.datadir_cxt.controlPath) diff --git a/src/include/access/xlogproc.h b/src/include/access/xlogproc.h index e144fefcc8f6393bf781af7641aa4a9194719764..5b5a05df911de351e542bf9426de7a9fae23a963 100755 --- a/src/include/access/xlogproc.h +++ b/src/include/access/xlogproc.h @@ -111,6 +111,7 @@ typedef struct { typedef struct { Buffer buff_id; pg_atomic_uint32 state; + pg_atomic_uint32 refcount; } ParseBufferDesc; #define RedoBufferSlotGetBuffer(bslot) ((bslot)->buf_id) @@ -687,7 +688,12 @@ typedef struct RefOperate *refOperate; }RedoParseManager; - +typedef enum { + XLOG_NO_DISTRIBUTE, + XLOG_HEAD_DISTRIBUTE, + XLOG_MID_DISTRIBUTE, + XLOG_TAIL_DISTRIBUTE, +} XlogDistributePos; typedef struct { void* nextrecord; @@ -695,6 +701,7 @@ typedef struct { RedoParseManager* manager; void* refrecord; /* origin dataptr, for mem release */ bool isFullSync; + XlogDistributePos distributeStatus; } XLogRecParseState; typedef struct XLogBlockRedoExtreRto { @@ -908,7 +915,36 @@ extern AbnormalProcFunc g_AbFunList[ABNORMAL_NUM]; #define ADD_ABNORMAL_POSITION(pos) #endif +static inline bool AtomicCompareExchangeBuffer(volatile Buffer *ptr, Buffer *expected, Buffer newval) +{ + bool ret = false; + Buffer current; + current = __sync_val_compare_and_swap(ptr, *expected, newval); + ret = (current == *expected); + *expected = current; + return ret; +} + +static inline Buffer AtomicReadBuffer(volatile Buffer *ptr) +{ + return *ptr; +} +static inline void AtomicWriteBuffer(volatile Buffer* ptr, Buffer val) +{ + *ptr = val; +} + +static inline Buffer AtomicExchangeBuffer(volatile Buffer *ptr, Buffer newval) +{ + Buffer old; + while (true) { + old = AtomicReadBuffer(ptr); + if (AtomicCompareExchangeBuffer(ptr, &old, newval)) + break; + } + return old; +} void HeapXlogCleanOperatorPage( RedoBufferInfo* buffer, void* recorddata, void* blkdata, Size datalen, Size* freespace, bool repairFragmentation); @@ -1204,6 +1240,7 @@ extern XLogRecParseState* xact_redo_parse_to_block(XLogReaderState* record, uint extern bool XLogBlockRedoForExtremeRTO(XLogRecParseState* redoblocktate, RedoBufferInfo *bufferinfo, bool notfound, RedoTimeCost &readBufCost, RedoTimeCost &redoCost); +extern void XlogBlockRedoForOndemandExtremeRTOQuery(XLogRecParseState *redoBlockState, RedoBufferInfo *bufferInfo); void XLogBlockParseStateRelease_debug(XLogRecParseState* recordstate, const char *func, uint32 line); #define XLogBlockParseStateRelease(recordstate) 
XLogBlockParseStateRelease_debug(recordstate, __FUNCTION__, __LINE__) #ifdef USE_ASSERT_CHECKING diff --git a/src/include/ddes/dms/ss_common_attr.h b/src/include/ddes/dms/ss_common_attr.h index 687df87ecb002db93c440139651c8f11ebdebebf..8289a696f088ecaf67335864200c06b1bdcc665b 100644 --- a/src/include/ddes/dms/ss_common_attr.h +++ b/src/include/ddes/dms/ss_common_attr.h @@ -140,6 +140,17 @@ #define SS_PRIMARY_STANDBY_CLUSTER_NORMAL_STANDBY \ (SS_NORMAL_STANDBY && (g_instance.attr.attr_storage.xlog_file_path != 0)) +#define SS_CLUSTER_NOT_NORAML (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus != CLUSTER_NORMAL)) +#define SS_CLUSTER_ONDEMAND_BUILD \ + (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus == CLUSTER_IN_ONDEMAND_BUILD)) +#define SS_CLUSTER_ONDEMAND_RECOVERY \ + (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus == CLUSTER_IN_ONDEMAND_RECOVERY)) +#define SS_CLUSTER_ONDEMAND_NORMAL \ + (ENABLE_DMS && (g_instance.dms_cxt.SSReformerControl.clusterStatus == CLUSTER_NORMAL)) +#define SS_STANDBY_ONDEMAND_BUILD (SS_STANDBY_MODE && SS_CLUSTER_ONDEMAND_BUILD) +#define SS_STANDBY_ONDEMAND_RECOVERY (SS_STANDBY_MODE && SS_CLUSTER_ONDEMAND_RECOVERY) +#define SS_STANDBY_ONDEMAND_NORMAL (SS_STANDBY_MODE && SS_CLUSTER_ONDEMAND_NORMAL) + /* DMS_BUF_NEED_LOAD */ #define BUF_NEED_LOAD 0x1 /* DMS_BUF_IS_LOADED */ @@ -207,5 +218,18 @@ typedef enum SSReformType { DMS_REFORM_TYPE_FOR_MAINTAIN } SSReformType; +typedef enum SSGlobalClusterState { + CLUSTER_IN_ONDEMAND_BUILD = 0, + CLUSTER_IN_ONDEMAND_RECOVERY, + CLUSTER_NORMAL +} SSGlobalClusterState; + +typedef enum SSOndemandRequestRedoStatus { + ONDEMAND_REDO_DONE = 0, + ONDEMAND_REDO_SKIP, + ONDEMAND_REDO_FAIL, + ONDEMAND_REDO_INVALID +} SSOndemandRequestRedoStatus; + #endif diff --git a/src/include/ddes/dms/ss_dms.h b/src/include/ddes/dms/ss_dms.h index 3a1d3363d875aa27ade0b8a3d547f966abdb7ab8..3430f499b5b7480bb84c387a2fe41be40c30c8b0 100644 --- a/src/include/ddes/dms/ss_dms.h +++ b/src/include/ddes/dms/ss_dms.h @@ -80,6 +80,8 @@ typedef struct st_ss_dms_func { void (*dms_refresh_logger)(char *log_field, unsigned long long *value); void (*dms_validate_drc)(dms_context_t *dms_ctx, dms_buf_ctrl_t *ctrl, unsigned long long lsn, unsigned char is_dirty); + int (*dms_reform_req_opengauss_ondemand_redo_buffer)(dms_context_t *dms_ctx, void *block_key, unsigned int key_len, + int *redo_status); } ss_dms_func_t; int ss_dms_func_init(); @@ -123,6 +125,8 @@ bool dms_latch_timed_s(dms_context_t *dms_ctx, dms_drlatch_t *dlatch, unsigned i void dms_unlatch(dms_context_t *dms_ctx, dms_drlatch_t *dlatch); void dms_pre_uninit(void); void dms_validate_drc(dms_context_t *dms_ctx, dms_buf_ctrl_t *ctrl, unsigned long long lsn, unsigned char is_dirty); +int dms_reform_req_opengauss_ondemand_redo_buffer(dms_context_t *dms_ctx, void *block_key, unsigned int key_len, + int *redo_status); #ifdef __cplusplus } #endif diff --git a/src/include/ddes/dms/ss_dms_bufmgr.h b/src/include/ddes/dms/ss_dms_bufmgr.h index 9d10736f67fc6d1f2c86e2a8e870842a06dff4bb..8807ad54b7319b5759c0bdc38090902c48bd9015 100644 --- a/src/include/ddes/dms/ss_dms_bufmgr.h +++ b/src/include/ddes/dms/ss_dms_bufmgr.h @@ -83,4 +83,5 @@ long SSGetBufSleepTime(int retry_times); SMGR_READ_STATUS SmgrNetPageCheckRead(Oid spcNode, Oid dbNode, Oid relNode, ForkNumber forkNum, BlockNumber blockNo, char *blockbuf); void SSUnPinBuffer(BufferDesc* buf_desc); +bool SSOndemandRequestPrimaryRedo(BufferTag tag); #endif diff --git a/src/include/ddes/dms/ss_dms_recovery.h 
b/src/include/ddes/dms/ss_dms_recovery.h index 6affe3c2c19044ea30fc7ca446221dc61327331f..265362c4105c62de9e3549acac6669464d08611b 100644 --- a/src/include/ddes/dms/ss_dms_recovery.h +++ b/src/include/ddes/dms/ss_dms_recovery.h @@ -32,10 +32,29 @@ #define SS_BEFORE_RECOVERY (ENABLE_DMS && g_instance.dms_cxt.SSReformInfo.in_reform == true \ && g_instance.dms_cxt.SSRecoveryInfo.recovery_pause_flag == true) #define SS_IN_FAILOVER (ENABLE_DMS && g_instance.dms_cxt.SSRecoveryInfo.in_failover == true) +#define SS_IN_ONDEMAND_RECOVERY (ENABLE_DMS && g_instance.dms_cxt.SSRecoveryInfo.in_ondemand_recovery == true) +#define SS_ONDEMAND_BUILD_DONE (ENABLE_DMS && SS_IN_ONDEMAND_RECOVERY \ + && t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone == true) +#define SS_ONDEMAND_RECOVERY_DONE (ENABLE_DMS && SS_IN_ONDEMAND_RECOVERY \ + && t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone == true) +#define SS_REPLAYED_BY_ONDEMAND (ENABLE_DMS && !SS_IN_ONDEMAND_RECOVERY && \ + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandBuildDone == true && \ + t_thrd.shemem_ptr_cxt.XLogCtl->IsOnDemandRecoveryDone == true) + +#define REFORM_CTRL_VERSION 1 + +typedef struct st_old_reformer_ctrl { + uint64 list_stable; // stable instances list + int primaryInstId; + pg_crc32c crc; +} ss_old_reformer_ctrl_t; typedef struct st_reformer_ctrl { + uint32 version; uint64 list_stable; // stable instances list int primaryInstId; + int recoveryInstId; + SSGlobalClusterState clusterStatus; pg_crc32c crc; } ss_reformer_ctrl_t; @@ -66,14 +85,14 @@ typedef struct ss_recovery_info { bool no_backend_left; bool startup_need_exit_normally; //used in alive failover bool recovery_trapped_in_page_request; //used in alive failover + bool in_ondemand_recovery; } ss_recovery_info_t; extern bool SSRecoveryNodes(); extern void SSWaitStartupExit(); extern int SSGetPrimaryInstId(); extern void SSSavePrimaryInstId(int id); -extern void SSReadControlFile(int id, bool updateDmsCtx = false); -extern void SSWriteReformerControlPages(void); +extern void SSInitReformerControlPages(void); extern bool SSRecoveryApplyDelay(); extern void SShandle_promote_signal(); extern void ss_failover_dw_init(); diff --git a/src/include/ddes/dms/ss_init.h b/src/include/ddes/dms/ss_init.h index ed83a51b6a6db25bedc3803ebc55faf638b9b06a..28451a991965e71a2dc0fcb6a3e6a74b6381509b 100644 --- a/src/include/ddes/dms/ss_init.h +++ b/src/include/ddes/dms/ss_init.h @@ -32,8 +32,10 @@ #define DMS_MAX_CONNECTIONS (int32)16000 #define SS_PRIMARY_ID g_instance.dms_cxt.SSReformerControl.primaryInstId // currently master ID is hardcoded as 0 +#define SS_RECOVERY_ID g_instance.dms_cxt.SSReformerControl.recoveryInstId #define SS_MY_INST_ID g_instance.attr.attr_storage.dms_attr.instance_id #define SS_OFFICIAL_PRIMARY (SS_MY_INST_ID == SS_PRIMARY_ID) +#define SS_OFFICIAL_RECOVERY_NODE (SS_MY_INST_ID == SS_RECOVERY_ID) void DMSInit(); void DMSUninit(); diff --git a/src/include/ddes/dms/ss_reform_common.h b/src/include/ddes/dms/ss_reform_common.h index 40cad18845e1f19e606bb424d68b4aa11c323d91..a934c3f3743b78e7d5edd751d2a1882ae3469b42 100644 --- a/src/include/ddes/dms/ss_reform_common.h +++ b/src/include/ddes/dms/ss_reform_common.h @@ -31,14 +31,18 @@ #define REFORM_WAIT_LONG 100000 /* 0.1 sec */ #define WAIT_REFORM_CTRL_REFRESH_TRIES 1000 +#define REFORM_CTRL_VERSION 1 + typedef struct SSBroadcastCancelTrx { SSBroadcastOp type; // must be first } SSBroadcastCancelTrx; -bool SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf); +int 
SSReadXlogInternal(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, XLogRecPtr targetRecPtr, char *buf, + int readLen); XLogReaderState *SSXLogReaderAllocate(XLogPageReadCB pagereadfunc, void *private_data, Size alignedSize); -void SSGetXlogPath(); -void SSSaveReformerCtrl(); +void SSGetRecoveryXlogPath(); +void SSSaveReformerCtrl(bool force = false); +void SSReadControlFile(int id, bool updateDmsCtx = false); void SSClearSegCache(); int SSCancelTransactionOfAllStandby(SSBroadcastOp type); int SSProcessCancelTransaction(SSBroadcastOp type); diff --git a/src/include/knl/knl_guc/knl_instance_attr_storage.h b/src/include/knl/knl_guc/knl_instance_attr_storage.h index f113c865e7233c3a9d6dd6b5cb579c1a8ed49316..003a59bf9b63bd2467e266767cbdd8237ba44efc 100755 --- a/src/include/knl/knl_guc/knl_instance_attr_storage.h +++ b/src/include/knl/knl_guc/knl_instance_attr_storage.h @@ -100,6 +100,8 @@ typedef struct knl_instance_attr_dms { bool enable_catalog_centralized; bool enable_dss_aio; bool enable_verify_page; + bool enable_ondemand_recovery; + int ondemand_recovery_mem_size; int instance_id; int recv_msg_pool_size; char* interconnect_url; diff --git a/src/include/knl/knl_instance.h b/src/include/knl/knl_instance.h index c9aa5edd2049fae639baf0ba6ad200dff7d2fab6..c3096bb84834be5b118e217991d15a707a67484b 100755 --- a/src/include/knl/knl_instance.h +++ b/src/include/knl/knl_instance.h @@ -741,6 +741,8 @@ typedef struct knl_g_parallel_redo_context { char* ali_buf; XLogRedoNumStatics xlogStatics[RM_NEXT_ID][MAX_XLOG_INFO_NUM]; RedoCpuBindControl redoCpuBindcontrl; + + HTAB **redoItemHash; /* used in ondemand extreme RTO */ } knl_g_parallel_redo_context; typedef struct knl_g_heartbeat_context { @@ -827,7 +829,7 @@ typedef struct knl_g_comm_context { long lastArchiveRcvTime; void* pLogCtl; bool rejectRequest; - + MemoryContext redoItemCtx; #ifdef USE_SSL libcomm_sslinfo* libcomm_data_port_list; libcomm_sslinfo* libcomm_ctrl_port_list; diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index 979580bfbf429ec6132ce493f040fbb511103bf1..f141b06d8c79ddafece76a22ca857f51d6023164 100755 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -3355,6 +3355,12 @@ typedef struct knl_t_dms_context { bool flush_copy_get_page_failed; //used in flush copy } knl_t_dms_context; +typedef struct knl_t_ondemand_xlog_copy_context { + int openLogFile; + XLogSegNo openLogSegNo; + uint32 openLogOff; +} knl_t_ondemand_xlog_copy_context; + /* thread context. 
*/ typedef struct knl_thrd_context { knl_thread_role role; @@ -3503,6 +3509,7 @@ typedef struct knl_thrd_context { knl_t_cfs_shrinker_context cfs_shrinker_cxt; knl_t_sql_patch_context sql_patch_cxt; knl_t_dms_context dms_cxt; + knl_t_ondemand_xlog_copy_context ondemand_xlog_copy_cxt; knl_t_rc_context rc_cxt; } knl_thrd_context; diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 1efaf07fec721e4879f4f77e066b363912edab97..47f54f789ef9341a3b0d1386dffd0ae4cad60a50 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -37,6 +37,7 @@ /***************************************************************************** * Backend version and inplace upgrade staffs *****************************************************************************/ +extern const uint32 ONDEMAND_REDO_VERSION_NUM; extern const uint32 SRF_FUSION_VERSION_NUM; extern const uint32 INNER_UNIQUE_VERSION_NUM; extern const uint32 PARTITION_ENHANCE_VERSION_NUM; @@ -131,6 +132,7 @@ extern const uint32 CREATE_TABLE_AS_VERSION_NUM; extern void register_backend_version(uint32 backend_version); extern bool contain_backend_version(uint32 version_number); +extern void SSUpgradeFileBeforeCommit(); #define INPLACE_UPGRADE_PRECOMMIT_VERSION 1 @@ -402,6 +404,7 @@ extern bool stack_is_too_deep(void); /* in tcop/utility.c */ extern void PreventCommandIfReadOnly(const char* cmdname); extern void PreventCommandDuringRecovery(const char* cmdname); +extern void PreventCommandDuringSSOndemandRecovery(Node* parseTree); extern int trace_recovery(int trace_level); diff --git a/src/include/replication/logical.h b/src/include/replication/logical.h index 75d491a4d3edd1e97d90ffd521319eaba894053d..ed8704f10b0bb2409e1c5eda2c8d8e17da0654a7 100644 --- a/src/include/replication/logical.h +++ b/src/include/replication/logical.h @@ -21,14 +21,8 @@ #include "replication/output_plugin.h" #include "postgres.h" #include "knl/knl_variable.h" - -#include "access/extreme_rto/redo_item.h" #include "nodes/pg_list.h" #include "storage/proc.h" - - -#include "access/extreme_rto/posix_semaphore.h" -#include "access/extreme_rto/spsc_blocking_queue.h" #include "access/parallel_recovery/redo_item.h" #include "nodes/parsenodes_common.h" diff --git a/src/include/storage/buf/bufmgr.h b/src/include/storage/buf/bufmgr.h index 255f3d5faa563037c46fea3823cfc0012fbe6560..a02c2837d9d21765ccff06815e1722807ced74e9 100644 --- a/src/include/storage/buf/bufmgr.h +++ b/src/include/storage/buf/bufmgr.h @@ -420,5 +420,6 @@ extern bool StartBufferIO(BufferDesc* buf, bool forInput); extern Buffer ReadBuffer_common_for_dms(ReadBufferMode readmode, BufferDesc *bufDesc, const XLogPhyBlock *pblk); extern void ReadBuffer_common_for_check(ReadBufferMode readmode, BufferDesc* buf_desc, const XLogPhyBlock *pblk, Block bufBlock); - +extern BufferDesc *RedoForOndemandExtremeRTOQuery(BufferDesc *bufHdr, char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode); #endif diff --git a/src/include/storage/lock/lwlock.h b/src/include/storage/lock/lwlock.h index 0ab2e3a2b0015bbd64d187bceaf2e4b1636a94e4..92858f0dde09aea56ad91c8ed13d7aa35d40556a 100644 --- a/src/include/storage/lock/lwlock.h +++ b/src/include/storage/lock/lwlock.h @@ -128,6 +128,9 @@ const struct LWLOCK_PARTITION_DESC LWLockPartInfo[] = { /* Number of partions of the segment head buffer */ #define NUM_SEGMENT_HEAD_PARTITIONS 128 +/* Number of partitions of the redo xlog track mapping hashtable */ +#define NUM_XLOG_TRACK_PARTITIONS 4096 + /* Number of partions the session roleid hashtable */ #define 
NUM_SESSION_ROLEID_PARTITIONS 128 @@ -190,8 +193,9 @@ const struct LWLOCK_PARTITION_DESC LWLockPartInfo[] = { #define FirstGPRCMappingLock (FirstSessRoleIdLock + NUM_SESSION_ROLEID_PARTITIONS) /* standby statement history */ #define FirstStandbyStmtHistLock (FirstGPRCMappingLock + NUM_GPRC_PARTITIONS) +#define FirstXlogTrackLock (FirstStandbyStmtHistLock + NUM_STANDBY_STMTHIST_PARTITIONS) /* must be last: */ -#define NumFixedLWLocks (FirstStandbyStmtHistLock + NUM_STANDBY_STMTHIST_PARTITIONS) +#define NumFixedLWLocks (FirstXlogTrackLock + NUM_XLOG_TRACK_PARTITIONS) /* * WARNING----Please keep BuiltinTrancheIds and BuiltinTrancheNames consistent!!! * @@ -270,6 +274,7 @@ enum BuiltinTrancheIds LWTRANCHE_REPLICATION_ORIGIN, LWTRANCHE_AUDIT_INDEX_WAIT, LWTRANCHE_PCA_BUFFER_CONTENT, + LWTRANCHE_XLOG_TRACK_PARTITION, /* * Each trancheId above should have a corresponding item in BuiltinTrancheNames; */ diff --git a/src/include/utils/elog.h b/src/include/utils/elog.h index ee324efb8c4cfc99de8c361769cfe4e20921228d..b3762f7d3c3a3d2de66940276089a88ba50ae819 100644 --- a/src/include/utils/elog.h +++ b/src/include/utils/elog.h @@ -638,8 +638,13 @@ extern void write_stderr(const char* fmt, ...) the supplied arguments. */ __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); -extern void getElevelAndSqlstate(int* eLevel, int* sqlState); +extern void write_stderr_with_prefix(const char* fmt, ...) + /* This extension allows gcc to check the format string for consistency with + the supplied arguments. */ + __attribute__((format(PG_PRINTF_ATTRIBUTE, 1, 2))); +extern void getElevelAndSqlstate(int* eLevel, int* sqlState); +extern void get_time_now(char* nowTime, int timeLen); void freeSecurityFuncSpace(char* charList, ...); extern void SimpleLogToServer(int elevel, bool silent, const char* fmt, ...) 
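Editor's note on the partitioned locking introduced above: the new `FirstXlogTrackLock` range in lwlock.h pairs with the `XlogTrackTableHashPartition`/`XlogTrackMappingPartitionLock` macros added in `ondemand_extreme_rto/batch_redo.h` earlier in this patch. A redo item's tag is hashed, and `hashcode % NUM_XLOG_TRACK_PARTITIONS` selects one of the 4096 fixed locks protecting the xlog-track hash table. The standalone sketch below illustrates only that selection; the FNV-1a hash, the simplified tag fields, and the sample values are illustrative stand-ins, not the server's `hash_any()`-based `XlogTrackTableHashCode()` or its `mainLWLockArray`.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Simplified stand-in for the server's RedoItemTag, which holds a
// RelFileNode, a ForkNumber and a BlockNumber (see batch_redo.h above).
struct RedoItemTag {
    uint32_t spcNode;
    uint32_t dbNode;
    uint32_t relNode;
    int32_t forkNum;
    uint32_t blockNum;
};

static const uint32_t NUM_XLOG_TRACK_PARTITIONS = 4096; // matches the lwlock.h value

// Hypothetical stand-in for the server's hash of the tag (the real code
// declares XlogTrackTableHashCode() and presumably hashes with hash_any()).
static uint32_t HashRedoItemTag(const RedoItemTag *tag)
{
    // FNV-1a over the raw struct bytes; per the note in batch_redo.h,
    // any pad bytes must be zeroed before the struct is used as a key.
    const unsigned char *p = reinterpret_cast<const unsigned char *>(tag);
    uint32_t h = 2166136261u;
    for (size_t i = 0; i < sizeof(*tag); i++) {
        h = (h ^ p[i]) * 16777619u;
    }
    return h;
}

int main()
{
    RedoItemTag tag;
    memset(&tag, 0, sizeof(tag)); // zero pad bytes, as the header warns
    tag.spcNode = 1663;           // sample values, purely illustrative
    tag.dbNode = 1;
    tag.relNode = 16384;
    tag.forkNum = 0;
    tag.blockNum = 42;

    // Mirrors XlogTrackTableHashPartition(): the partition index picks one
    // lock in the fixed range FirstXlogTrackLock .. FirstXlogTrackLock + 4095.
    uint32_t partition = HashRedoItemTag(&tag) % NUM_XLOG_TRACK_PARTITIONS;
    printf("redo item maps to xlog-track partition %u\n", partition);
    return 0;
}
```

Two concurrent lookups only contend when their tags land in the same partition, which is the usual reason for carving a fixed 4096-lock range out of `NumFixedLWLocks` instead of a single table lock.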
diff --git a/src/test/regress/output/recovery_2pc_tools.source b/src/test/regress/output/recovery_2pc_tools.source index 43742d5133b23c2ca0de99ddbc9a5005762a5481..3d9140e012874dc338d2afae06eb827010bf8356 100644 --- a/src/test/regress/output/recovery_2pc_tools.source +++ b/src/test/regress/output/recovery_2pc_tools.source @@ -617,6 +617,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c ss_enable_catalog_centralized | bool | | | ss_enable_dms | bool | | | ss_enable_dss | bool | | | + ss_enable_ondemand_recovery | bool | | | ss_enable_reform | bool | | | ss_enable_scrlock | bool | | | ss_enable_scrlock_sleep_mode | bool | | | @@ -638,6 +639,7 @@ select name,vartype,unit,min_val,max_val from pg_settings where name <> 'qunit_c ss_log_max_file_size | integer | kB | 1024 | 4194304 ssl_renegotiation_limit | integer | kB | 0 | 2147483647 ss_ock_log_path | string | | | + ss_ondemand_recovery_mem_size | integer | kB | 1048576 | 104857600 ss_parallel_thread_count | integer | | 0 | 64 ss_rdma_work_config | string | | | ss_recv_msg_pool_size | integer | kB | 1024 | 1048576 diff --git a/src/test/regress/pg_regress.cpp b/src/test/regress/pg_regress.cpp index 21adfa42300c37800aaf20ed23eb36c35d8df6bf..58510063cd863b612d3157327e3efedb05400d29 100644 --- a/src/test/regress/pg_regress.cpp +++ b/src/test/regress/pg_regress.cpp @@ -5412,7 +5412,7 @@ static void CheckCleanCodeWarningInfo(const int baseNum, const int currentNum, return; } -#define BASE_GLOBAL_VARIABLE_NUM 222 +#define BASE_GLOBAL_VARIABLE_NUM 224 #define CMAKE_CMD_BUF_LEN 1000 @@ -5461,7 +5461,7 @@ static void check_global_variables() } } -#define BASE_PGXC_LIKE_MACRO_NUM 1391 +#define BASE_PGXC_LIKE_MACRO_NUM 1392 static void check_pgxc_like_macros() { #ifdef BUILD_BY_CMAKE
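Editor's note on the new GUC bounds registered above: `ss_ondemand_recovery_mem_size` is a kB-valued integer bounded to 1048576..104857600 kB (1 GB to 100 GB), which lines up with the `redo_utils.h` macros earlier in the patch: `PARSEBUFFER_SIZE`, `ONDEMAND_MAX_PARSEBUFF_PREPALLOC` (whole parse buffers per just-under-1 GB pre-allocation chunk) and `ONDEMAND_MAX_PARSEBUFF_ALLOCSIZE` (100 chunks, i.e. ~100 GB). The sketch below only works through that capacity arithmetic; the 512-byte `PARSEBUFFER_SIZE` is a hypothetical placeholder, since the real value is `sizeof(XLogRecParseState) + sizeof(ParseBufferDesc)` and depends on the build.

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical placeholder: in the server this is
// sizeof(XLogRecParseState) + sizeof(ParseBufferDesc).
static const uint64_t PARSEBUFFER_SIZE = 512;

// Mirrors ONDEMAND_MAX_PARSEBUFF_PREPALLOC: whole parse buffers that fit
// in one just-under-1GB pre-allocation chunk.
static const uint64_t MAX_PARSEBUFF_PREPALLOC =
    (1024ULL * 1024 * 1024 - 1) / PARSEBUFFER_SIZE;

// Mirrors ONDEMAND_MAX_PARSEBUFF_ALLOCSIZE: at most 100 chunks (~100GB),
// matching the GUC's 104857600 kB upper bound.
static const uint64_t MAX_ALLOC_CHUNKS = 100;

int main()
{
    uint64_t memSizeKb = 1048576; // GUC minimum: 1048576 kB = 1 GB
    uint64_t totalBuffers = (memSizeKb * 1024) / PARSEBUFFER_SIZE;

    // Round up: the last chunk may be only partially used.
    uint64_t chunks =
        (totalBuffers + MAX_PARSEBUFF_PREPALLOC - 1) / MAX_PARSEBUFF_PREPALLOC;
    if (chunks > MAX_ALLOC_CHUNKS) { // clamp to the ~100GB ceiling
        chunks = MAX_ALLOC_CHUNKS;
        totalBuffers = chunks * MAX_PARSEBUFF_PREPALLOC;
    }

    printf("ss_ondemand_recovery_mem_size=%llukB -> %llu parse buffers in %llu chunk(s)\n",
           (unsigned long long)memSizeKb, (unsigned long long)totalBuffers,
           (unsigned long long)chunks);
    return 0;
}
```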