From 507aa793a0e0c3381a9c013ca3b331c305e3267d Mon Sep 17 00:00:00 2001 From: chenxiaobin19 <1025221611@qq.com> Date: Wed, 8 Jun 2022 11:40:08 +0800 Subject: [PATCH] publication and subscription support for initial data copy --- doc/src/sgml/ref/alter_subscription.sgmlin | 48 + doc/src/sgml/ref/create_subscription.sgmlin | 10 + src/bin/gs_guc/cluster_guc.conf | 1 + src/common/backend/catalog/CMakeLists.txt | 2 +- src/common/backend/catalog/Makefile | 1 + src/common/backend/catalog/builtin_funcs.ini | 6 +- src/common/backend/catalog/heap.cpp | 6 + src/common/backend/catalog/pg_enum.cpp | 14 - src/common/backend/catalog/pg_inherits.cpp | 16 +- .../backend/catalog/pg_subscription.cpp | 299 +++++ src/common/backend/catalog/system_views.sql | 1 + src/common/backend/nodes/copyfuncs.cpp | 3 + src/common/backend/nodes/equalfuncs.cpp | 3 + src/common/backend/parser/gram.y | 13 + src/common/backend/utils/adt/acl.cpp | 19 +- src/common/backend/utils/adt/misc.cpp | 19 + src/common/backend/utils/adt/oid.cpp | 13 + src/common/backend/utils/cache/relcache.cpp | 11 + src/common/backend/utils/cache/syscache.cpp | 11 +- src/common/backend/utils/init/globals.cpp | 3 +- .../backend/utils/misc/guc/guc_storage.cpp | 14 + .../utils/misc/postgresql_single.conf.sample | 1 + src/common/backend/utils/time/snapmgr.cpp | 106 +- src/gausskernel/optimizer/commands/async.cpp | 2 - src/gausskernel/optimizer/commands/copy.cpp | 10 +- .../optimizer/commands/subscriptioncmds.cpp | 515 ++++++++- src/gausskernel/process/postmaster/pgstat.cpp | 17 +- src/gausskernel/process/tcop/utility.cpp | 2 +- .../process/threadpool/knl_thread.cpp | 1 + src/gausskernel/storage/ipc/pmsignal.cpp | 7 +- src/gausskernel/storage/ipc/procarray.cpp | 11 +- src/gausskernel/storage/lmgr/predicate.cpp | 22 +- .../storage/replication/libpqwalreceiver.cpp | 190 ++- .../storage/replication/logical/Makefile | 2 +- .../storage/replication/logical/launcher.cpp | 96 +- .../storage/replication/logical/origin.cpp | 77 +- .../storage/replication/logical/proto.cpp | 6 + .../storage/replication/logical/relation.cpp | 6 + .../storage/replication/logical/snapbuild.cpp | 114 +- .../storage/replication/logical/tablesync.cpp | 1018 +++++++++++++++++ .../storage/replication/logical/worker.cpp | 156 ++- .../storage/replication/repl_gram.y | 30 +- .../storage/replication/repl_scanner.l | 5 +- src/gausskernel/storage/replication/slot.cpp | 6 +- .../replication/subscription_walreceiver.cpp | 5 +- .../storage/replication/walreceiver.cpp | 2 +- .../storage/replication/walsender.cpp | 108 +- src/include/catalog/catversion.h | 2 +- src/include/catalog/indexing.h | 2 + src/include/catalog/pg_subscription_rel.h | 81 ++ .../rollback_catalog_maindb_92_607.sql | 37 + .../rollback_catalog_otherdb_92_607.sql | 37 + .../upgrade-post_catalog_maindb_92_607.sql | 26 + .../upgrade_catalog_maindb_92_607.sql | 31 + .../upgrade-post_catalog_otherdb_92_607.sql | 26 + .../upgrade_catalog_otherdb_92_607.sql | 31 + src/include/commands/copy.h | 4 +- src/include/commands/subscriptioncmds.h | 2 +- .../knl/knl_guc/knl_instance_attr_storage.h | 1 + src/include/knl/knl_session.h | 2 +- src/include/knl/knl_thread.h | 6 + src/include/miscadmin.h | 3 + src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes_common.h | 1 + src/include/nodes/replnodes.h | 10 + src/include/pgstat.h | 3 + src/include/replication/libpqwalreceiver.h | 34 +- src/include/replication/logicalproto.h | 1 + src/include/replication/origin.h | 8 +- src/include/replication/slot.h | 3 +- src/include/replication/snapbuild.h | 1 + 
.../replication/subscription_walreceiver.h | 2 +- src/include/replication/walreceiver.h | 21 +- src/include/replication/worker_internal.h | 21 +- src/include/storage/predicate.h | 2 +- src/include/storage/procarray.h | 2 +- src/include/utils/builtins.h | 1 + src/include/utils/snapmgr.h | 2 + src/include/utils/syscache.h | 2 +- src/test/ha/pubsub_check.sh | 73 ++ src/test/regress/input/publication.source | 4 + src/test/regress/input/subscription.source | 3 + src/test/regress/output/publication.source | 9 + src/test/regress/output/subscription.source | 5 + src/tools/pgindent/typedefs.list | 1 + 85 files changed, 3218 insertions(+), 311 deletions(-) create mode 100644 src/gausskernel/storage/replication/logical/tablesync.cpp create mode 100644 src/include/catalog/pg_subscription_rel.h create mode 100644 src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_607.sql create mode 100644 src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_607.sql create mode 100644 src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_607.sql create mode 100644 src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_607.sql create mode 100644 src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_607.sql create mode 100644 src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_607.sql create mode 100644 src/test/ha/pubsub_check.sh diff --git a/doc/src/sgml/ref/alter_subscription.sgmlin b/doc/src/sgml/ref/alter_subscription.sgmlin index d1f2dc5018..2c4e4d168a 100644 --- a/doc/src/sgml/ref/alter_subscription.sgmlin +++ b/doc/src/sgml/ref/alter_subscription.sgmlin @@ -23,6 +23,7 @@ PostgreSQL documentation ALTER SUBSCRIPTION name CONNECTION 'conninfo' ALTER SUBSCRIPTION name SET PUBLICATION publication_name [, ...] +ALTER SUBSCRIPTION name REFRESH PUBLICATION [ WITH ( refresh_option [= value] [, ... ] ) ] ALTER SUBSCRIPTION name ENABLE ALTER SUBSCRIPTION name SET ( subscription_parameter [= value] [, ... ] ) ALTER SUBSCRIPTION name OWNER TO new_owner @@ -46,6 +47,23 @@ ALTER SUBSCRIPTION name RENAME TO < (Currently, all subscription owners must be superusers, so the owner checks will be bypassed in practice. But this might change in the future.) + + + When refreshing a publication we remove the relations that are no longer + part of the publication and we also remove the table synchronization slots + if there are any. It is necessary to remove these slots so that the resources + allocated for the subscription on the remote host are released. If due to + network breakdown or some other error, openGauss + is unable to remove the slots, an ERROR will be reported. To proceed in this + situation, the user either needs to retry the operation or disassociate the + slot from the subscription and drop the subscription as explained in + . + + + + Commands ALTER SUBSCRIPTION ... REFRESH PUBLICATION + cannot be executed inside a transaction block. + @@ -76,6 +94,36 @@ ALTER SUBSCRIPTION name RENAME TO < SET PUBLICATION publication_name + + REFRESH PUBLICATION + + + Fetch missing table information from publisher. This will start + replication of tables that were added to the subscribed-to publications + since the last invocation of REFRESH PUBLICATION or + since CREATE SUBSCRIPTION. + + + + refresh_option specifies additional options for the + refresh operation. 
The supported options are: + + + + copy_data (boolean) + + + Specifies whether the existing data in the publications that are + being subscribed to should be copied once the replication starts. + The default is true. (Previously subscribed + tables are not copied.) + + + + + + + ENABLE diff --git a/doc/src/sgml/ref/create_subscription.sgmlin b/doc/src/sgml/ref/create_subscription.sgmlin index f1be6ed08d..ca5f72a0dc 100644 --- a/doc/src/sgml/ref/create_subscription.sgmlin +++ b/doc/src/sgml/ref/create_subscription.sgmlin @@ -96,6 +96,16 @@ CREATE SUBSCRIPTION subscription_name + + copy_data (boolean) + + + Specifies whether the existing data in the publications that are + being subscribed to should be copied once the replication starts. + The default is true. + + + enabled (boolean) diff --git a/src/bin/gs_guc/cluster_guc.conf b/src/bin/gs_guc/cluster_guc.conf index a19a76d861..d23e9b7bdf 100755 --- a/src/bin/gs_guc/cluster_guc.conf +++ b/src/bin/gs_guc/cluster_guc.conf @@ -658,6 +658,7 @@ cost_weight_index|real|1e-10,1e+10|NULL|NULL| default_limit_rows|real|-100,1.79769e+308|NULL|NULL| sql_beta_feature|enum|partition_fdw_on,partition_opfusion,index_cost_with_leaf_pages_only,canonical_pathkey,join_sel_with_cast_func,no_unique_index_first,sel_semi_poisson,sel_expr_instr,param_path_gen,rand_cost_opt,param_path_opt,page_est_opt,a_style_coerce,predpush_same_level,none|NULL|NULL| max_logical_replication_workers|int|0,262143|NULL|Maximum number of logical replication worker processes.| +max_sync_workers_per_subscription|int|0,262143|NULL|Maximum number of table synchronization workers per subscription.| walwriter_sleep_threshold|int64|1,50000|NULL|NULL| walwriter_cpu_bind|int|-1,2147483647|NULL|NULL| wal_file_init_num|int|0,1000000|NULL|NULL| diff --git a/src/common/backend/catalog/CMakeLists.txt b/src/common/backend/catalog/CMakeLists.txt index 3a6ae4992a..bb6951147a 100755 --- a/src/common/backend/catalog/CMakeLists.txt +++ b/src/common/backend/catalog/CMakeLists.txt @@ -15,7 +15,7 @@ set(POSTGRES_BKI_SRCS_S @pg_object.h @pg_synonym.h @toasting.h @indexing.h @gs_obsscaninfo.h @pg_directory.h @pg_hashbucket.h @gs_global_chain.h @gs_global_config.h @pg_streaming_stream.h @pg_streaming_cont_query.h @pg_streaming_reaper_status.h @gs_matview.h @gs_matview_dependency.h @pgxc_slice.h @gs_opt_model.h @pg_recyclebin.h @pg_snapshot.h @gs_model.h @gs_package.h @gs_job_argument.h @gs_job_attribute.h @pg_uid.h @gs_db_privilege.h -@pg_replication_origin.h @pg_publication.h @pg_publication_rel.h @pg_subscription.h" +@pg_replication_origin.h @pg_publication.h @pg_publication_rel.h @pg_subscription.h @pg_subscription_rel.h" ) diff --git a/src/common/backend/catalog/Makefile b/src/common/backend/catalog/Makefile index 71eb2548da..03bfa2c451 100644 --- a/src/common/backend/catalog/Makefile +++ b/src/common/backend/catalog/Makefile @@ -61,6 +61,7 @@ POSTGRES_BKI_SRCS = $(addprefix $(top_srcdir)/src/include/catalog/,\ gs_matview_dependency.h pgxc_slice.h gs_opt_model.h gs_model.h\ pg_recyclebin.h pg_snapshot.h gs_job_argument.h gs_job_attribute.h pg_uid.h gs_db_privilege.h\ pg_replication_origin.h pg_publication.h pg_publication_rel.h pg_subscription.h\ + pg_subscription_rel.h \ ) # location of Catalog.pm diff --git a/src/common/backend/catalog/builtin_funcs.ini b/src/common/backend/catalog/builtin_funcs.ini index d489c8b2e9..22c0fa2933 100755 --- a/src/common/backend/catalog/builtin_funcs.ini +++ b/src/common/backend/catalog/builtin_funcs.ini @@ -7868,6 +7868,10 @@ "pg_get_replication_slots", 1, 
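For orientation, a minimal usage sketch of the syntax documented above; the subscription, publication, and connection values are hypothetical:

    -- Existing data in the published tables is copied because copy_data
    -- defaults to true.
    CREATE SUBSCRIPTION mysub
        CONNECTION 'host=pubhost port=5432 dbname=postgres user=repuser password=xxxxxxxx'
        PUBLICATION mypub
        WITH (copy_data = true);

    -- Later, after new tables were added to mypub on the publisher, start
    -- replicating (and initially copying) them on the subscriber:
    ALTER SUBSCRIPTION mysub REFRESH PUBLICATION WITH (copy_data = true);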
AddBuiltinFunc(_0(3784), _1("pg_get_replication_slots"), _2(0), _3(false), _4(true), _5(pg_get_replication_slots), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(10), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('s'), _19(0), _20(0), _21(9, 25, 25, 25, 26, 16, 28, 28, 25, 16), _22(9, 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o'), _23(9, "slot_name", "plugin", "slot_type", "datoid", "active", "xmin", "catalog_xmin", "restart_lsn", "dummy_standby"), _24(NULL), _25("pg_get_replication_slots"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("information about replication slots currently in use"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) ), + AddFuncGroup( + "pg_get_replica_identity_index", 1, + AddBuiltinFunc(_0(6120), _1("pg_get_replica_identity_index"), _2(1), _3(true), _4(false), _5(pg_get_replica_identity_index), _6(2205), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(10), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('s'), _19(0), _20(1, 2205), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_get_replica_identity_index"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("oid of replica identity index if any"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) + ), AddFuncGroup( "pg_get_ruledef", 2, AddBuiltinFunc(_0(1573), _1("pg_get_ruledef"), _2(1), _3(true), _4(false), _5(pg_get_ruledef), _6(25), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('s'), _19(0), _20(1, 26), _21(NULL), _22(NULL), _23(NULL), _24(NULL), _25("pg_get_ruledef"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("source text of a rule"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)), @@ -8597,7 +8601,7 @@ ), AddFuncGroup( "pg_stat_get_subscription", 1, - AddBuiltinFunc(_0(2802), _1("pg_stat_get_subscription"), _2(1), _3(false), _4(false), _5(pg_stat_get_subscription), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('s'), _19(0), _20(1, 26), _21(8, 26, 26, 23, 25, 1184, 1184, 25, 1184), _22(8, 'i', 'o', 'o', 'o', 'o', 'o', 'o', 'o'), _23(8, "subid", "subid", "pid", "received_lsn", "last_msg_send_time", "last_msg_receipt_time", "latest_end_lsn", "latest_end_time"), _24(NULL), _25("pg_stat_get_subscription"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), _33("statistics: information about subscription"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) + AddBuiltinFunc(_0(2802), _1("pg_stat_get_subscription"), _2(1), _3(false), _4(false), _5(pg_stat_get_subscription), _6(2249), _7(PG_CATALOG_NAMESPACE), _8(BOOTSTRAP_SUPERUSERID), _9(INTERNALlanguageId), _10(1), _11(0), _12(0), _13(0), _14(false), _15(false), _16(false), _17(false), _18('s'), _19(0), _20(1, 26), _21(9, 26, 26, 26, 23, 25, 1184, 1184, 25, 1184), _22(9, 'i', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o'), _23(9, "subid", "subid", "relid", "pid", "received_lsn", "last_msg_send_time", "last_msg_receipt_time", "latest_end_lsn", "latest_end_time"), _24(NULL), _25("pg_stat_get_subscription"), _26(NULL), _27(NULL), _28(NULL), _29(0), _30(false), _31(NULL), _32(false), 
_33("statistics: information about subscription"), _34('f'), _35(NULL), _36(0), _37(false), _38(NULL), _39(NULL), _40(0)) ), AddFuncGroup( "pg_stat_get_thread", 1, diff --git a/src/common/backend/catalog/heap.cpp b/src/common/backend/catalog/heap.cpp index 38ecd1b3e6..20f8f25193 100644 --- a/src/common/backend/catalog/heap.cpp +++ b/src/common/backend/catalog/heap.cpp @@ -69,6 +69,7 @@ #include "catalog/storage.h" #include "catalog/storage_xlog.h" #include "catalog/storage_gtt.h" +#include "catalog/pg_subscription_rel.h" #include "commands/matview.h" #include "commands/tablecmds.h" #include "commands/tablespace.h" @@ -3574,6 +3575,11 @@ void heap_drop_with_catalog(Oid relid) } relation_close(rel, NoLock); + /* + * Remove any associated relation synchronization states. + */ + RemoveSubscriptionRel(InvalidOid, relid); + /* * Forget any ON COMMIT action for the rel */ diff --git a/src/common/backend/catalog/pg_enum.cpp b/src/common/backend/catalog/pg_enum.cpp index 73188a7cdf..d54a5f62e3 100644 --- a/src/common/backend/catalog/pg_enum.cpp +++ b/src/common/backend/catalog/pg_enum.cpp @@ -29,7 +29,6 @@ #include "utils/snapmgr.h" static void RenumberEnumType(Relation pg_enum, HeapTuple* existing, int nelems); -static int oid_cmp(const void* p1, const void* p2); static int sort_order_cmp(const void* p1, const void* p2); #define checkEnumLableValue(val) \ @@ -556,19 +555,6 @@ static void RenumberEnumType(Relation pg_enum, HeapTuple* existing, int nelems) CommandCounterIncrement(); } -/* qsort comparison function for oids */ -static int oid_cmp(const void* p1, const void* p2) -{ - Oid v1 = *((const Oid*)p1); - Oid v2 = *((const Oid*)p2); - - if (v1 < v2) - return -1; - if (v1 > v2) - return 1; - return 0; -} - /* qsort comparison function for tuples by sort order */ static int sort_order_cmp(const void* p1, const void* p2) { diff --git a/src/common/backend/catalog/pg_inherits.cpp b/src/common/backend/catalog/pg_inherits.cpp index 81742cefec..0f6458e06d 100644 --- a/src/common/backend/catalog/pg_inherits.cpp +++ b/src/common/backend/catalog/pg_inherits.cpp @@ -27,12 +27,11 @@ #include "catalog/pg_inherits_fn.h" #include "parser/parse_type.h" #include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/fmgroids.h" #include "utils/syscache.h" #include "utils/snapmgr.h" -static int oid_cmp(const void* p1, const void* p2); - /* * find_inheritance_children * @@ -327,16 +326,3 @@ bool typeInheritsFrom(Oid subclassTypeId, Oid superclassTypeId) return result; } - -/* qsort comparison function */ -static int oid_cmp(const void* p1, const void* p2) -{ - Oid v1 = *((const Oid*)p1); - Oid v2 = *((const Oid*)p2); - - if (v1 < v2) - return -1; - if (v1 > v2) - return 1; - return 0; -} diff --git a/src/common/backend/catalog/pg_subscription.cpp b/src/common/backend/catalog/pg_subscription.cpp index 2db299a751..86b7dea51a 100644 --- a/src/common/backend/catalog/pg_subscription.cpp +++ b/src/common/backend/catalog/pg_subscription.cpp @@ -18,16 +18,22 @@ #include "access/genam.h" #include "access/heapam.h" #include "access/htup.h" +#include "access/tableam.h" #include "catalog/pg_type.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" +#include "storage/lmgr.h" #include "nodes/makefuncs.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/fmgroids.h" +#include "utils/pg_lsn.h" #include "utils/syscache.h" +#include "catalog/indexing.h" +#include "utils/snapmgr.h" static List *textarray_to_stringlist(ArrayType *textarray); @@ -230,3 +236,296 @@ static List 
*textarray_to_stringlist(ArrayType *textarray) return res; } + +static Datum LsnGetTextDatum(XLogRecPtr lsn) +{ + char clsn[MAXFNAMELEN]; + int ret = snprintf_s(clsn, sizeof(clsn), sizeof(clsn) - 1, "%X/%X", (uint32)(lsn >> 32), (uint32)lsn); + securec_check_ss(ret, "\0", "\0"); + + return CStringGetTextDatum(clsn); +} + +static XLogRecPtr TextDatumGetLsn(Datum datum) +{ + XLogRecPtr lsn; + uint32 lsn_hi; + uint32 lsn_lo; + char* clsn = TextDatumGetCString(datum); + int ret = sscanf_s(clsn, "%X/%X", &lsn_hi, &lsn_lo); + securec_check_for_sscanf_s(ret, 2, "\0", "\0"); + /* Calculate LSN */ + lsn = ((uint64) lsn_hi )<< 32 | lsn_lo; + + return lsn; +} + +/* + * Add new state record for a subscription table. + */ +Oid AddSubscriptionRelState(Oid subid, Oid relid, char state) +{ + Relation rel; + HeapTuple tup; + Oid subrelid; + bool nulls[Natts_pg_subscription_rel]; + Datum values[Natts_pg_subscription_rel]; + int rc; + + /* Prevent concurrent changes. */ + rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock); + + /* Try finding existing mapping. */ + tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, ObjectIdGetDatum(relid), ObjectIdGetDatum(subid)); + + if (HeapTupleIsValid(tup)) + elog(ERROR, "subscription table %u in subscription %u already exists", relid, subid); + + /* Form the tuple. */ + rc = memset_s(values, sizeof(values), 0, sizeof(values)); + securec_check(rc, "", ""); + rc = memset_s(nulls, sizeof(nulls), false, sizeof(nulls)); + securec_check(rc, "", ""); + values[Anum_pg_subscription_rel_srsubid - 1] = ObjectIdGetDatum(subid); + values[Anum_pg_subscription_rel_srrelid - 1] = ObjectIdGetDatum(relid); + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + values[Anum_pg_subscription_rel_srcsn - 1] = UInt64GetDatum(InvalidCommitSeqNo); + values[Anum_pg_subscription_rel_srsublsn - 1] = LsnGetTextDatum(InvalidXLogRecPtr); + + tup = heap_form_tuple(RelationGetDescr(rel), values, nulls); + + /* Insert tuple into catalog. */ + subrelid = simple_heap_insert(rel, tup); + CatalogUpdateIndexes(rel, tup); + tableam_tops_free_tuple(tup); + /* Cleanup. */ + heap_close(rel, NoLock); + + return subrelid; +} + +/* + * Update the state of a subscription table. + */ +Oid UpdateSubscriptionRelState(Oid subid, Oid relid, char state, XLogRecPtr sublsn, CommitSeqNo subcsn) +{ + Relation rel; + HeapTuple tup; + Oid subrelid; + bool nulls[Natts_pg_subscription_rel]; + Datum values[Natts_pg_subscription_rel]; + bool replaces[Natts_pg_subscription_rel]; + int rc; + + LockSharedObject(SubscriptionRelationId, subid, 0, AccessShareLock); + + rel = heap_open(SubscriptionRelRelationId, RowExclusiveLock); + + /* Try finding existing mapping. */ + tup = SearchSysCacheCopy2(SUBSCRIPTIONRELMAP, ObjectIdGetDatum(relid), ObjectIdGetDatum(subid)); + if (!HeapTupleIsValid(tup)) + elog(ERROR, "subscription table %u in subscription %u does not exist", relid, subid); + + /* Update the tuple. 
*/ + rc = memset_s(values, sizeof(values), 0, sizeof(values)); + securec_check(rc, "", ""); + rc = memset_s(nulls, sizeof(nulls), false, sizeof(nulls)); + securec_check(rc, "", ""); + rc = memset_s(replaces, sizeof(replaces), false, sizeof(replaces)); + securec_check(rc, "", ""); + + replaces[Anum_pg_subscription_rel_srsubstate - 1] = true; + values[Anum_pg_subscription_rel_srsubstate - 1] = CharGetDatum(state); + + if (subcsn != InvalidCommitSeqNo) { + replaces[Anum_pg_subscription_rel_srcsn - 1] = true; + values[Anum_pg_subscription_rel_srcsn - 1] = Int64GetDatum(subcsn); + } + + replaces[Anum_pg_subscription_rel_srsublsn - 1] = true; + if (sublsn != InvalidXLogRecPtr) + values[Anum_pg_subscription_rel_srsublsn - 1] = LsnGetTextDatum(sublsn); + else + nulls[Anum_pg_subscription_rel_srsublsn - 1] = true; + + tup = (HeapTuple)tableam_tops_modify_tuple(tup, RelationGetDescr(rel), values, nulls, replaces); + + /* Update the catalog. */ + simple_heap_update(rel, &tup->t_self, tup); + CatalogUpdateIndexes(rel, tup); + + subrelid = HeapTupleGetOid(tup); + tableam_tops_free_tuple(tup); + + /* Cleanup. */ + heap_close(rel, NoLock); + + return subrelid; +} + +/* + * Get state of subscription table. + * + * Returns SUBREL_STATE_UNKNOWN when the table is not in the subscription. + */ +char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn, CommitSeqNo *subcsn) +{ + HeapTuple tup; + char substate; + bool isnull; + Datum d; + Relation rel; + + /* + * This is to avoid the race condition with AlterSubscription which tries + * to remove this relstate. + */ + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + /* Try finding the mapping. */ + tup = SearchSysCache2(SUBSCRIPTIONRELMAP, ObjectIdGetDatum(relid), ObjectIdGetDatum(subid)); + + if (!HeapTupleIsValid(tup)) { + heap_close(rel, AccessShareLock); + *sublsn = InvalidXLogRecPtr; + return SUBREL_STATE_UNKNOWN; + } + + /* Get the state. */ + substate = ((Form_pg_subscription_rel)GETSTRUCT(tup))->srsubstate; + + /* Get the LSN */ + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, Anum_pg_subscription_rel_srsublsn, &isnull); + if (isnull) + *sublsn = InvalidXLogRecPtr; + else + *sublsn = TextDatumGetLsn(d); + + if (subcsn) { + /* Get the Csn */ + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, Anum_pg_subscription_rel_srcsn, &isnull); + if (isnull) + *subcsn = InvalidCommitSeqNo; + else + *subcsn = DatumGetInt64(d); + } + + /* Cleanup */ + ReleaseSysCache(tup); + + heap_close(rel, AccessShareLock); + + return substate; +} + +/* + * Drop subscription relation mapping. These can be for a particular + * subscription, or for a particular relation, or both. + */ +void RemoveSubscriptionRel(Oid subid, Oid relid) +{ + Relation rel; + TableScanDesc scan; + ScanKeyData skey[2]; + HeapTuple tup; + int nkeys = 0; + + /* Prevent concurrent changes (see SetSubscriptionRelState()). */ + rel = heap_open(SubscriptionRelRelationId, ShareRowExclusiveLock); + + if (OidIsValid(subid)) { + ScanKeyInit(&skey[nkeys++], Anum_pg_subscription_rel_srsubid, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + } + + if (OidIsValid(relid)) { + ScanKeyInit(&skey[nkeys++], Anum_pg_subscription_rel_srrelid, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relid)); + } + + /* Do the search and delete what we found. 
*/ + scan = tableam_scan_begin(rel, SnapshotNow, nkeys, skey); + while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection))) { + Form_pg_subscription_rel subrel = (Form_pg_subscription_rel)GETSTRUCT(tup); + + /* + * We don't allow to drop the relation mapping when the table + * synchronization is in progress unless the caller updates the + * corresponding subscription as well. This is to ensure that we don't + * leave tablesync slots or origins in the system when the + * corresponding table is dropped. + */ + if (!OidIsValid(subid) && subrel->srsubstate != SUBREL_STATE_READY) { + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("could not drop relation mapping for subscription \"%s\"", + get_subscription_name(subrel->srsubid, false)), + errdetail("Table synchronization for relation \"%s\" is in progress and is in state \"%c\".", + get_rel_name(relid), subrel->srsubstate), + /* + * translator: first %s is a SQL ALTER command and second %s is a + * SQL DROP command + */ + errhint("Use %s to enable subscription if not already enabled or use %s to drop the subscription.", + "ALTER SUBSCRIPTION ... ENABLE", "DROP SUBSCRIPTION ..."))); + } + + simple_heap_delete(rel, &tup->t_self); + } + heap_endscan(scan); + + heap_close(rel, ShareRowExclusiveLock); +} + +/* + * Get all relations for subscription, or get that are + * not in a ready status if needNotReady is true. + * + * Returned list is palloced in current memory context. + */ +List *GetSubscriptionRelations(Oid subid, bool needNotReady) +{ + List *res = NIL; + Relation rel; + HeapTuple tup; + int nkeys = 0; + ScanKeyData skey[2]; + SysScanDesc scan; + + rel = heap_open(SubscriptionRelRelationId, AccessShareLock); + + ScanKeyInit(&skey[nkeys++], Anum_pg_subscription_rel_srsubid, BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(subid)); + + if (needNotReady) { + ScanKeyInit(&skey[nkeys++], Anum_pg_subscription_rel_srsubstate, BTEqualStrategyNumber, F_CHARNE, + CharGetDatum(SUBREL_STATE_READY)); + } + + scan = systable_beginscan(rel, InvalidOid, false, NULL, nkeys, skey); + + while (HeapTupleIsValid(tup = systable_getnext(scan))) { + Form_pg_subscription_rel subrel; + SubscriptionRelState *relstate; + Datum d; + bool isnull; + + subrel = (Form_pg_subscription_rel)GETSTRUCT(tup); + + relstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState)); + relstate->relid = subrel->srrelid; + relstate->state = subrel->srsubstate; + d = SysCacheGetAttr(SUBSCRIPTIONRELMAP, tup, Anum_pg_subscription_rel_srsublsn, &isnull); + if (isnull) + relstate->lsn = InvalidXLogRecPtr; + else + relstate->lsn = TextDatumGetLsn(d); + + res = lappend(res, relstate); + } + + /* Cleanup */ + systable_endscan(scan); + heap_close(rel, AccessShareLock); + + return res; +} diff --git a/src/common/backend/catalog/system_views.sql b/src/common/backend/catalog/system_views.sql index 058724cab6..ab9c077c9b 100644 --- a/src/common/backend/catalog/system_views.sql +++ b/src/common/backend/catalog/system_views.sql @@ -3552,6 +3552,7 @@ CREATE VIEW pg_stat_subscription AS su.oid AS subid, su.subname, st.pid, + st.relid, st.received_lsn, st.last_msg_send_time, st.last_msg_receipt_time, diff --git a/src/common/backend/nodes/copyfuncs.cpp b/src/common/backend/nodes/copyfuncs.cpp index 1271749d91..b9ce9cf619 100644 --- a/src/common/backend/nodes/copyfuncs.cpp +++ b/src/common/backend/nodes/copyfuncs.cpp @@ -6308,6 +6308,9 @@ static AlterSubscriptionStmt *_copyAlterSubscriptionStmt(const AlterSubscription COPY_STRING_FIELD(subname); 
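The pg_subscription_rel catalog and the new relid column of pg_stat_subscription introduced above can be queried on the subscriber to follow initial table synchronization; a hypothetical check:

    -- Per-table sync state recorded for each subscribed table:
    SELECT srsubid, srrelid::regclass AS relname, srsubstate, srsublsn
      FROM pg_subscription_rel;

    -- Worker activity; relid identifies the table a tablesync worker is copying:
    SELECT subname, pid, relid, received_lsn
      FROM pg_stat_subscription;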
COPY_NODE_FIELD(options); + if (t_thrd.proc->workingVersionNum >= PUBLICATION_INITIAL_DATA_VERSION_NAME) { + COPY_SCALAR_FIELD(refresh); + } return newnode; } diff --git a/src/common/backend/nodes/equalfuncs.cpp b/src/common/backend/nodes/equalfuncs.cpp index ba789f587c..97bbe18d25 100644 --- a/src/common/backend/nodes/equalfuncs.cpp +++ b/src/common/backend/nodes/equalfuncs.cpp @@ -2933,6 +2933,9 @@ static bool _equalAlterSubscriptionStmt(const AlterSubscriptionStmt *a, const Al { COMPARE_STRING_FIELD(subname); COMPARE_NODE_FIELD(options); + if (t_thrd.proc->workingVersionNum >= PUBLICATION_INITIAL_DATA_VERSION_NAME) { + COMPARE_SCALAR_FIELD(refresh); + } return true; } diff --git a/src/common/backend/parser/gram.y b/src/common/backend/parser/gram.y index 9d73d43e26..6cc564aa53 100644 --- a/src/common/backend/parser/gram.y +++ b/src/common/backend/parser/gram.y @@ -14771,6 +14771,7 @@ AlterSubscriptionStmt: AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); n->subname = $3; + n->refresh = false; n->options = $5; $$ = (Node *)n; } @@ -14779,6 +14780,7 @@ AlterSubscriptionStmt: AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); n->subname = $3; + n->refresh = false; n->options = list_make1(makeDefElem("conninfo", (Node *)makeString($5))); $$ = (Node *)n; @@ -14788,14 +14790,25 @@ AlterSubscriptionStmt: AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); n->subname = $3; + n->refresh = false; n->options = list_make1(makeDefElem("publication", (Node *)$6)); $$ = (Node *)n; } + | ALTER SUBSCRIPTION name REFRESH PUBLICATION opt_definition + { + AlterSubscriptionStmt *n = + makeNode(AlterSubscriptionStmt); + n->subname = $3; + n->refresh = true; + n->options = $6; + $$ = (Node *)n; + } | ALTER SUBSCRIPTION name ENABLE_P { AlterSubscriptionStmt *n = makeNode(AlterSubscriptionStmt); + n->refresh = false; n->subname = $3; n->options = list_make1(makeDefElem("enabled", (Node *)makeInteger(TRUE))); diff --git a/src/common/backend/utils/adt/acl.cpp b/src/common/backend/utils/adt/acl.cpp index 6a30f0ed28..5d8b94e6fd 100644 --- a/src/common/backend/utils/adt/acl.cpp +++ b/src/common/backend/utils/adt/acl.cpp @@ -47,7 +47,6 @@ static bool aclitem_match(const AclItem* a1, const AclItem* a2); static int aclitemComparator(const void* arg1, const void* arg2); static void check_circularity(const Acl* old_acl, const AclItem* mod_aip, Oid ownerId); static Acl* recursive_revoke(Acl* acl, Oid grantee, AclMode revoke_privs, Oid ownerId, DropBehavior behavior); -static int oidComparator(const void* arg1, const void* arg2); static AclMode convert_priv_string(text* priv_type_text); static AclMode convert_any_priv_string(text* priv_type_text, const priv_map* privileges); @@ -1512,7 +1511,7 @@ int aclmembers(const Acl* acl, Oid** roleids) } /* Sort the array */ - qsort(list, j, sizeof(Oid), oidComparator); + qsort(list, j, sizeof(Oid), oid_cmp); /* Remove duplicates from the array */ k = 0; @@ -1530,22 +1529,6 @@ int aclmembers(const Acl* acl, Oid** roleids) return k + 1; } -/* - * oidComparator - * qsort comparison function for Oids - */ -static int oidComparator(const void* arg1, const void* arg2) -{ - Oid oid1 = *(const Oid*)arg1; - Oid oid2 = *(const Oid*)arg2; - - if (oid1 > oid2) - return 1; - if (oid1 < oid2) - return -1; - return 0; -} - /* * aclinsert (exported function) */ diff --git a/src/common/backend/utils/adt/misc.cpp b/src/common/backend/utils/adt/misc.cpp index a544ca3e2b..f005507a19 100644 --- a/src/common/backend/utils/adt/misc.cpp +++ b/src/common/backend/utils/adt/misc.cpp 
@@ -1006,3 +1006,22 @@ void cancel_backend(ThreadId pid) errhint("fail to cancel backend process for privilege"))); } } + +/* + * SQL wrapper around RelationGetReplicaIndex(). + */ +Datum pg_get_replica_identity_index(PG_FUNCTION_ARGS) +{ + Oid reloid = PG_GETARG_OID(0); + Oid idxoid; + Relation rel; + + rel = heap_open(reloid, AccessShareLock); + idxoid = RelationGetReplicaIndex(rel); + heap_close(rel, AccessShareLock); + + if (OidIsValid(idxoid)) + PG_RETURN_OID(idxoid); + else + PG_RETURN_NULL(); +} diff --git a/src/common/backend/utils/adt/oid.cpp b/src/common/backend/utils/adt/oid.cpp index 5473580396..86bc797f91 100644 --- a/src/common/backend/utils/adt/oid.cpp +++ b/src/common/backend/utils/adt/oid.cpp @@ -322,6 +322,19 @@ Oid oidparse(Node* node) return InvalidOid; /* keep compiler quiet */ } +/* qsort comparison function for Oids */ +int oid_cmp(const void *p1, const void *p2) +{ + Oid v1 = *((const Oid *)p1); + Oid v2 = *((const Oid *)p2); + + if (v1 < v2) + return -1; + if (v1 > v2) + return 1; + return 0; +} + /***************************************************************************** * PUBLIC ROUTINES * *****************************************************************************/ diff --git a/src/common/backend/utils/cache/relcache.cpp b/src/common/backend/utils/cache/relcache.cpp index f44b182b07..92b98b6139 100644 --- a/src/common/backend/utils/cache/relcache.cpp +++ b/src/common/backend/utils/cache/relcache.cpp @@ -92,6 +92,7 @@ #include "catalog/gs_package.h" #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_range.h" #include "catalog/pg_recyclebin.h" #include "catalog/pg_replication_origin.h" @@ -335,6 +336,7 @@ static const FormData_pg_attribute Desc_pg_publication_rel[Natts_pg_publication_ static const FormData_pg_attribute Desc_pg_replication_origin[Natts_pg_replication_origin] = { Schema_pg_replication_origin }; +static const FormData_pg_attribute Desc_pg_subscription_rel[Natts_pg_subscription_rel] = {Schema_pg_subscription_rel}; /* Please add to the array in ascending order of oid value */ static struct CatalogRelationBuildParam catalogBuildParam[CATALOG_NUM] = {{DefaultAclRelationId, @@ -901,6 +903,15 @@ static struct CatalogRelationBuildParam catalogBuildParam[CATALOG_NUM] = {{Defau Desc_pg_replication_origin, false, true}, + {SubscriptionRelRelationId, + "pg_subscription_rel", + SubscriptionRelRelation_Rowtype_Id, + false, + false, + Natts_pg_subscription_rel, + Desc_pg_subscription_rel, + false, + true}, {PackageRelationId, "gs_package", PackageRelation_Rowtype_Id, diff --git a/src/common/backend/utils/cache/syscache.cpp b/src/common/backend/utils/cache/syscache.cpp index f6b954fc2e..24b3af7b3d 100644 --- a/src/common/backend/utils/cache/syscache.cpp +++ b/src/common/backend/utils/cache/syscache.cpp @@ -108,6 +108,7 @@ #include "catalog/pg_publication.h" #include "catalog/pg_publication_rel.h" #include "catalog/pg_replication_origin.h" +#include "catalog/pg_subscription_rel.h" /* --------------------------------------------------------------------------- @@ -305,11 +306,11 @@ const cachedesc cacheinfo[] = { 1, {ObjectIdAttributeNumber, 0, 0, 0}, 32}, - {ModelRelationId, /* DB4AI_MODELOID */ - GsModelOidIndexId, - 1, - {ObjectIdAttributeNumber, 0, 0, 0}, - 256}, + {SubscriptionRelRelationId, /* SUBSCRIPTIONRELMAP */ + SubscriptionRelSrrelidSrsubidIndexId, + 2, + {Anum_pg_subscription_rel_srrelid, Anum_pg_subscription_rel_srsubid, 0, 0}, + 64}, {ModelRelationId, /* 
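The misc.cpp hunk above also adds a SQL-callable helper; a hypothetical call (table name invented) that returns the OID of the relation's replica identity index, or NULL if it has none:

    SELECT pg_get_replica_identity_index('public.t1'::regclass);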
DB4AI_MODEL */ GsModelNameIndexId, 1, diff --git a/src/common/backend/utils/init/globals.cpp b/src/common/backend/utils/init/globals.cpp index a147cdfc8a..6fa314a39b 100644 --- a/src/common/backend/utils/init/globals.cpp +++ b/src/common/backend/utils/init/globals.cpp @@ -59,7 +59,7 @@ bool open_join_children = true; bool will_shutdown = false; /* hard-wired binary version number */ -const uint32 GRAND_VERSION_NUM = 92606; +const uint32 GRAND_VERSION_NUM = 92607; const uint32 PREDPUSH_SAME_LEVEL_VERSION_NUM = 92522; const uint32 UPSERT_WHERE_VERSION_NUM = 92514; @@ -102,6 +102,7 @@ const uint32 COMMENT_RECORD_PARAM_VERSION_NUM = 92484; const uint32 SCAN_BATCH_MODE_VERSION_NUM = 92568; const uint32 PUBLICATION_VERSION_NUM = 92580; const uint32 SUBSCRIPTION_BINARY_VERSION_NUM = 92606; +const uint32 PUBLICATION_INITIAL_DATA_VERSION_NAME = 92607; /* Version number of the guc parameter backend_version added in V500R001C20 */ const uint32 V5R1C20_BACKEND_VERSION_NUM = 92305; diff --git a/src/common/backend/utils/misc/guc/guc_storage.cpp b/src/common/backend/utils/misc/guc/guc_storage.cpp index b53d83820f..d70c9b6f76 100755 --- a/src/common/backend/utils/misc/guc/guc_storage.cpp +++ b/src/common/backend/utils/misc/guc/guc_storage.cpp @@ -2232,6 +2232,20 @@ static void InitStorageConfigureNamesInt() NULL, NULL}, + {{"max_sync_workers_per_subscription", + PGC_SIGHUP, + NODE_SINGLENODE, + REPLICATION, + gettext_noop("Maximum number of table synchronization workers per subscription."), + NULL}, + &g_instance.attr.attr_storage.max_sync_workers_per_subscription, + 2, + 0, + MAX_BACKENDS, + NULL, + NULL, + NULL}, + {{"recovery_time_target", PGC_SIGHUP, NODE_ALL, diff --git a/src/common/backend/utils/misc/postgresql_single.conf.sample b/src/common/backend/utils/misc/postgresql_single.conf.sample index 2033781df3..de84a76486 100644 --- a/src/common/backend/utils/misc/postgresql_single.conf.sample +++ b/src/common/backend/utils/misc/postgresql_single.conf.sample @@ -336,6 +336,7 @@ hot_standby = on # "on" allows queries during recovery #enable_xlog_prune = on # xlog keep for all standbys even through they are not connecting and donnot created replslot. #max_size_for_xlog_prune = 2147483647 # xlog keep for the wal size less than max_xlog_size when the enable_xlog_prune is on #max_logical_replication_workers = 4 # Maximum number of logical replication worker processes. +#max_sync_workers_per_subscription = 2 # Maximum number of table synchronization workers per subscription. #------------------------------------------------------------------------------ # QUERY TUNING diff --git a/src/common/backend/utils/time/snapmgr.cpp b/src/common/backend/utils/time/snapmgr.cpp index 754bc54f32..e9dec7c013 100644 --- a/src/common/backend/utils/time/snapmgr.cpp +++ b/src/common/backend/utils/time/snapmgr.cpp @@ -62,6 +62,7 @@ #ifdef PGXC #include "pgxc/pgxc.h" #endif +#include "storage/sinvaladt.h" SnapshotData CatalogSnapshotData = {SNAPSHOT_MVCC}; /* @@ -82,18 +83,13 @@ static THR_LOCAL bool RegisterStreamSnapshot = false; /* Define pathname of exported-snapshot files */ #define SNAPSHOT_EXPORT_DIR "pg_snapshots" -#define XactExportFilePath(path, xid, num, suffix) \ - { \ - int rc = snprintf_s(path, \ - sizeof(path), \ - sizeof(path) - 1, \ - SNAPSHOT_EXPORT_DIR "/%08X%08X-%d%s", \ - (uint32)((xid) >> 32), \ - (uint32)(xid), \ - (num), \ - (suffix)); \ - securec_check_ss(rc, "", ""); \ - } + +/* Structure holding info about exported snapshot. 
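The new GUC registered above can be inspected from SQL; the default of 2 comes from the guc_storage.cpp entry in this patch:

    SHOW max_sync_workers_per_subscription;   -- 2 unless configured otherwise
    SHOW max_logical_replication_workers;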
*/ +typedef struct ExportedSnapshot { + char *snapfile; + Snapshot snapshot; +} ExportedSnapshot; + #define MAX_ULONG_LENGTH 22 /* Static variables representing various special snapshot semantics */ @@ -617,7 +613,7 @@ void SnapshotSetCommandId(CommandId curcid) * must take care of all the same considerations as the first-snapshot case * in GetTransactionSnapshot. */ -static void SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) +void SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, int sourcepid) { /* Caller should have checked this already */ Assert(!u_sess->utils_cxt.FirstSnapshotSet); @@ -664,7 +660,7 @@ static void SetTransactionSnapshot(Snapshot sourcesnap, TransactionId sourcexid) */ if (IsolationUsesXactSnapshot()) { if (IsolationIsSerializable()) - SetSerializableTransactionSnapshot(u_sess->utils_cxt.CurrentSnapshot, sourcexid); + SetSerializableTransactionSnapshot(u_sess->utils_cxt.CurrentSnapshot, sourcevxid, sourcepid); /* Make a saved copy */ Assert(!(u_sess->utils_cxt.CurrentSnapshot != NULL && u_sess->utils_cxt.CurrentSnapshot->user_data != NULL)); u_sess->utils_cxt.CurrentSnapshot = CopySnapshot(u_sess->utils_cxt.CurrentSnapshot); @@ -1114,26 +1110,26 @@ void AtEOXact_Snapshot(bool isCommit) * If we exported any snapshots, clean them up. */ if (u_sess->utils_cxt.exportedSnapshots != NIL) { - TransactionId myxid = GetTopTransactionId(); - int i; - char buf[MAXPGPATH]; - + ListCell *lc; /* * Get rid of the files. Unlink failure is only a WARNING because (1) * it's too late to abort the transaction, and (2) leaving a leaked * file around has little real consequence anyway. + * + * We also also need to remove the snapshots from RegisteredSnapshots + * to prevent a warning below. + * + * As with the FirstXactSnapshot, we don't need to free resources of + * the snapshot iself as it will go away with the memory context. */ - for (i = 1; i <= list_length(u_sess->utils_cxt.exportedSnapshots); i++) { - XactExportFilePath(buf, myxid, i, ""); - if (unlink(buf)) - ereport(WARNING, (errmsg("could not unlink file \"%s\": %m", buf))); + foreach(lc, u_sess->utils_cxt.exportedSnapshots) { + ExportedSnapshot *esnap = (ExportedSnapshot *)lfirst(lc); + + if (unlink(esnap->snapfile)) + ereport(WARNING, (errmsg("could not unlink file \"%s\": %m", esnap->snapfile))); } /* - * As with the FirstXactSnapshot, we needn't spend any effort on - * cleaning up the per-snapshot data structures, but we do need to - * adjust the RegisteredSnapshots count to prevent a warning below. - * * Note: you might be thinking "why do we have the exportedSnapshots * list at all? All we need is a counter!". You're right, but we do * it this way in case we ever feel like improving xmin management. @@ -1189,6 +1185,7 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) { TransactionId topXid; TransactionId* children = NULL; + ExportedSnapshot *esnap; int nchildren; int addTopXid; StringInfoData buf; @@ -1197,6 +1194,7 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) MemoryContext oldcxt; char path[MAXPGPATH]; char pathtmp[MAXPGPATH]; + int rc; /* * It's tempting to call RequireTransactionChain here, since it's not very @@ -1213,9 +1211,9 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) */ /* - * This will assign a transaction ID if we do not yet have one. + * Get our transaction ID if there is one, to include in the snapshot. 
*/ - topXid = GetTopTransactionId(); + topXid = GetTopTransactionIdIfAny(); /* * We cannot export a snapshot from a subtransaction because there's no @@ -1233,6 +1231,14 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) */ nchildren = xactGetCommittedChildren(&children); + /* + * Generate file path for the snapshot. We start numbering of snapshots + * inside the transaction from 1. + */ + rc = snprintf_s(path, sizeof(path), sizeof(path) - 1, SNAPSHOT_EXPORT_DIR "/%08X-%08X-%d", + t_thrd.proc->backendId, t_thrd.proc->lxid, list_length(u_sess->utils_cxt.exportedSnapshots) + 1); + securec_check_ss(rc, "", ""); + /* * Copy the snapshot into u_sess->top_transaction_mem_cxt, add it to the * exportedSnapshots list, and mark it pseudo-registered. We do this to @@ -1243,7 +1249,10 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) snapshot = CopySnapshot(snapshot); oldcxt = MemoryContextSwitchTo(u_sess->top_transaction_mem_cxt); - u_sess->utils_cxt.exportedSnapshots = lappend(u_sess->utils_cxt.exportedSnapshots, snapshot); + esnap = (ExportedSnapshot *) palloc(sizeof(ExportedSnapshot)); + esnap->snapfile = pstrdup(path); + esnap->snapshot = snapshot; + u_sess->utils_cxt.exportedSnapshots = lappend(u_sess->utils_cxt.exportedSnapshots, esnap); (void)MemoryContextSwitchTo(oldcxt); snapshot->regd_count++; @@ -1256,7 +1265,8 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) */ initStringInfo(&buf); - appendStringInfo(&buf, "xid:" XID_FMT "\n", topXid); + appendStringInfo(&buf, "vxid: %d/" XID_FMT "\n", t_thrd.proc->backendId, t_thrd.proc->lxid); + appendStringInfo(&buf, "pid:%lu\n", t_thrd.proc_cxt.MyProcPid); appendStringInfo(&buf, "dbid:%u\n", u_sess->proc_cxt.MyDatabaseId); appendStringInfo(&buf, "iso:%d\n", u_sess->utils_cxt.XactIsoLevel); appendStringInfo(&buf, "ro:%d\n", u_sess->attr.attr_common.XactReadOnly); @@ -1277,7 +1287,7 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) * xmax. (We need not make the same check for subxip[] members, see * snapshot.h.) */ - addTopXid = TransactionIdPrecedes(topXid, snapshot->xmax) ? 1 : 0; + addTopXid = (TransactionIdIsValid(topXid) && TransactionIdPrecedes(topXid, snapshot->xmax)) ? 1 : 0; if (addTopXid) appendStringInfo(&buf, "xip:" XID_FMT "\n", topXid); @@ -1306,7 +1316,8 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) * ensures that no other backend can read an incomplete file * (ImportSnapshot won't allow it because of its valid-characters check). */ - XactExportFilePath(pathtmp, topXid, list_length(u_sess->utils_cxt.exportedSnapshots), ".tmp"); + rc = snprintf_s(pathtmp, sizeof(pathtmp), sizeof(pathtmp) - 1, "%s.tmp", path); + securec_check_ss(rc, "", ""); if (!(f = AllocateFile(pathtmp, PG_BINARY_W))) ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", pathtmp))); @@ -1322,8 +1333,6 @@ char* ExportSnapshot(Snapshot snapshot, CommitSeqNo *snapshotCsn) * Now that we have written everything into a .tmp file, rename the file * to remove the .tmp suffix. 
*/ - XactExportFilePath(path, topXid, list_length(u_sess->utils_cxt.exportedSnapshots), ""); - if (rename(pathtmp, path) < 0) ereport( ERROR, (errcode_for_file_access(), errmsg("could not rename file \"%s\" to \"%s\": %m", pathtmp, path))); @@ -1457,6 +1466,25 @@ static GTM_Timeline parseTimelineFromText(const char* prefix, char** s, const ch return val; } +static void parseVxidFromText(const char *prefix, char **s, const char *filename, VirtualTransactionId *vxid) +{ + char *ptr = *s; + int prefixlen = strlen(prefix); + + if (strncmp(ptr, prefix, prefixlen) != 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); + ptr += prefixlen; + if (sscanf_s(ptr, "%d/%u", &vxid->backendId, &vxid->localTransactionId) != 2) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); + ptr = strchr(ptr, '\n'); + if (!ptr) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", filename))); + *s = ptr + 1; +} + /* * ImportSnapshot * Import a previously exported snapshot. The argument should be a @@ -1469,7 +1497,8 @@ void ImportSnapshot(const char* idstr) FILE* f = NULL; struct stat stat_buf; char* filebuf = NULL; - TransactionId src_xid; + VirtualTransactionId src_vxid; + int src_pid; Oid src_dbid; int src_isolevel; bool src_readonly = false; @@ -1531,7 +1560,8 @@ void ImportSnapshot(const char* idstr) rc = memset_s(&snapshot, sizeof(SnapshotData), 0, sizeof(snapshot)); securec_check(rc, "", ""); - src_xid = parseXidFromText("xid:", &filebuf, path); + parseVxidFromText("vxid:", &filebuf, path, &src_vxid); + src_pid = parseIntFromText("pid:", &filebuf, path); /* we abuse parseXidFromText a bit here ... */ src_dbid = parseXidFromText("dbid:", &filebuf, path); src_isolevel = parseIntFromText("iso:", &filebuf, path); @@ -1550,7 +1580,7 @@ void ImportSnapshot(const char* idstr) * don't trouble to check the array elements, just the most critical * fields. 
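With the ExportSnapshot/ImportSnapshot changes above, an exported snapshot identifier now encodes the exporting backend's virtual transaction id instead of its xid. Assuming the usual pg_export_snapshot()/SET TRANSACTION SNAPSHOT interfaces are exposed in this build, usage would look roughly like this (the identifier shown is made up):

    -- Session 1: export a snapshot.
    BEGIN;
    SELECT pg_export_snapshot();            -- e.g. '00000003-0000001B-1'

    -- Session 2: adopt it before taking any other snapshot.
    BEGIN TRANSACTION ISOLATION LEVEL REPEATABLE READ;
    SET TRANSACTION SNAPSHOT '00000003-0000001B-1';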
*/ - if (!TransactionIdIsNormal(src_xid) || !OidIsValid(src_dbid) || !TransactionIdIsNormal(snapshot.xmin) || + if (!VirtualTransactionIdIsValid(src_vxid) || !OidIsValid(src_dbid) || !TransactionIdIsNormal(snapshot.xmin) || !TransactionIdIsNormal(snapshot.xmax)) ereport(ERROR, (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), errmsg("invalid snapshot data in file \"%s\"", path))); @@ -1587,7 +1617,7 @@ void ImportSnapshot(const char* idstr) (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot import a snapshot from a different database"))); /* OK, install the snapshot */ - SetTransactionSnapshot(&snapshot, src_xid); + SetTransactionSnapshot(&snapshot, &src_vxid, src_pid); } /* diff --git a/src/gausskernel/optimizer/commands/async.cpp b/src/gausskernel/optimizer/commands/async.cpp index ee5070a235..64a6314428 100644 --- a/src/gausskernel/optimizer/commands/async.cpp +++ b/src/gausskernel/optimizer/commands/async.cpp @@ -200,8 +200,6 @@ typedef struct QueueBackendStatus { QueuePosition pos; /* backend has read queue up to here */ } QueueBackendStatus; -#define InvalidPid ((ThreadId)(-1)) - /* * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * diff --git a/src/gausskernel/optimizer/commands/copy.cpp b/src/gausskernel/optimizer/commands/copy.cpp index ad6e43840a..6eb6ce8b02 100644 --- a/src/gausskernel/optimizer/commands/copy.cpp +++ b/src/gausskernel/optimizer/commands/copy.cpp @@ -244,7 +244,6 @@ uint64 DoCopyTo(CopyState cstate); static uint64 CopyTo(CopyState cstate, bool isFirst, bool isLast); static uint64 CopyToCompatiblePartions(CopyState cstate); void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum* values, const bool* nulls); -static uint64 CopyFrom(CopyState cstate); static void EstCopyMemInfo(Relation rel, UtilityDesc* desc); static int CopyFromCompressAndInsertBatch(PageCompress* pcState, EState* estate, CommandId mycid, int hi_options, @@ -3878,7 +3877,7 @@ void HeapAddToBulk(CopyFromBulk bulk, Tuple tup, bool needCopy) /* * Copy FROM file to relation. */ -static uint64 CopyFrom(CopyState cstate) +uint64 CopyFrom(CopyState cstate) { Tuple tuple; TupleDesc tupDesc; @@ -5567,7 +5566,8 @@ static void CopyInitCstateVar(CopyState cstate) * Returns a CopyState, to be passed to NextCopyFrom and related functions. 
*/ CopyState BeginCopyFrom(Relation rel, const char* filename, List* attnamelist, - List* options, void* mem_info, const char* queryString) + List* options, void* mem_info, const char* queryString, + CopyGetDataFunc func) { CopyState cstate; bool pipe = (filename == NULL); @@ -5709,7 +5709,9 @@ CopyState BeginCopyFrom(Relation rel, const char* filename, List* attnamelist, cstate->volatile_defexprs = volatile_defexprs; cstate->num_defaults = num_defaults; - if (pipe) { + if (func) { + cstate->copyGetDataFunc = func; + } else if (pipe) { if (t_thrd.postgres_cxt.whereToSendOutput == DestRemote) ReceiveCopyBegin(cstate); else diff --git a/src/gausskernel/optimizer/commands/subscriptioncmds.cpp b/src/gausskernel/optimizer/commands/subscriptioncmds.cpp index 656ffbe8aa..67490ec945 100644 --- a/src/gausskernel/optimizer/commands/subscriptioncmds.cpp +++ b/src/gausskernel/optimizer/commands/subscriptioncmds.cpp @@ -20,10 +20,12 @@ #include "access/xact.h" #include "catalog/indexing.h" +#include "catalog/namespace.h" #include "catalog/objectaccess.h" #include "catalog/objectaddress.h" #include "catalog/pg_type.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/dependency.h" #include "commands/defrem.h" @@ -43,10 +45,16 @@ #include "utils/syscache.h" #include "utils/array.h" #include "utils/acl.h" +#include "access/tableam.h" +#include "libpq/libpq-fe.h" +#include "replication/slot.h" static bool ConnectPublisher(char* conninfo, char* slotname); -static void CreateSlotInPublisher(char *slotname); +static void CreateSlotInPublisherAndInsertSubRel(char *slotname, Oid subid, List *publications = NULL, + bool copy_data = false); static void ValidateReplicationSlot(char *slotname, List *publications); +static List *fetch_table_list(List *publications); +static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname); /* * Common option parsing function for CREATE and ALTER SUBSCRIPTION commands. @@ -56,7 +64,8 @@ static void ValidateReplicationSlot(char *slotname, List *publications); * accommodate that. */ static void parse_subscription_options(const List *options, char **conninfo, List **publications, bool *enabled_given, - bool *enabled, bool *slot_name_given, char **slot_name, char **synchronous_commit, bool *binary_given, bool *binary) + bool *enabled, bool *slot_name_given, char **slot_name, char **synchronous_commit, bool *binary_given, bool *binary, + bool *copy_data_given, bool *copy_data) { ListCell *lc; @@ -81,6 +90,11 @@ static void parse_subscription_options(const List *options, char **conninfo, Lis *binary = false; } + if (copy_data) { + *copy_data_given = false; + *copy_data = true; + } + /* Parse options */ foreach (lc, options) { DefElem *defel = (DefElem *)lfirst(lc); @@ -137,6 +151,15 @@ static void parse_subscription_options(const List *options, char **conninfo, Lis *binary_given = true; *binary = defGetBoolean(defel); + } else if (strcmp(defel->defname, "copy_data") == 0 && copy_data) { + if (*copy_data_given) { + ereport(ERROR, + (errcode(ERRCODE_SYNTAX_ERROR), + errmsg("conflicting or redundant options"))); + } + + *copy_data_given = true; + *copy_data = defGetBoolean(defel); } else { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("unrecognized subscription parameter: %s", defel->defname))); @@ -289,20 +312,41 @@ static bool ConnectPublisher(char* conninfo, char* slotname) } /* - * Create replication slot in publisher side. 
+ * Create replication slot in publisher side and Insert tables into pg_subscription_rel. * Please make sure you have already connect to publisher before calling this func. */ -static void CreateSlotInPublisher(char *slotname) +static void CreateSlotInPublisherAndInsertSubRel(char *slotname, Oid subid, List *publications, bool copy_data) { LibpqrcvConnectParam options; + char table_state; + List *tables = NIL; + ListCell *lc = NULL; int rc = memset_s(&options, sizeof(LibpqrcvConnectParam), 0, sizeof(LibpqrcvConnectParam)); securec_check(rc, "", ""); options.logical = true; options.slotname = slotname; PG_TRY(); { - (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_create_slot(&options); + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_create_slot(&options, NULL, NULL); ereport(NOTICE, (errmsg("created replication slot \"%s\" on publisher", slotname))); + + /* + * Set sync state based on if we were asked to do data copy or + * not. + */ + table_state = copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY; + + /* + * Get the table list from publisher and build local table status + * info. + */ + tables = fetch_table_list(publications); + foreach (lc, tables) { + RangeVar *rv = (RangeVar *)lfirst(lc); + Oid relid = RangeVarGetRelid(rv, AccessShareLock, true); + + AddSubscriptionRelState(subid, relid, table_state); + } } PG_CATCH(); { @@ -366,6 +410,8 @@ ObjectAddress CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) bool slotname_given; bool binary; bool binary_given; + bool copy_data; + bool copy_data_given; char originname[NAMEDATALEN]; List *publications; int rc; @@ -375,7 +421,7 @@ ObjectAddress CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) * Connection and publication should not be specified here. */ parse_subscription_options(stmt->options, NULL, NULL, &enabled_given, &enabled, &slotname_given, &slotname, - &synchronous_commit, &binary_given, &binary); + &synchronous_commit, &binary_given, &binary, ©_data_given, ©_data); /* * Since creating a replication slot is not transactional, rolling back @@ -464,7 +510,7 @@ ObjectAddress CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("Failed to connect to publisher."))); } - CreateSlotInPublisher(slotname); + CreateSlotInPublisherAndInsertSubRel(slotname, subid, publications, copy_data); (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect(); } pfree_ext(encryptConninfo); @@ -486,10 +532,191 @@ ObjectAddress CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel) return myself; } +static void AlterSubscription_refresh(Subscription *sub, bool copy_data) +{ + List *pubrel_names = NIL; + List *subrel_states = NIL; + Oid *subrel_local_oids = NULL; + Oid *pubrel_local_oids = NULL; + ListCell *lc = NULL; + int off; + int remove_rel_len; + Relation rel = NULL; + typedef struct SubRemoveRels { + Oid relid; + char state; + } SubRemoveRels; + SubRemoveRels *sub_remove_rels = NULL; + + PG_TRY(); + { + /* Try to connect to the publisher. */ + if (!AttemptConnectPublisher(sub->conninfo, sub->slotname, true)) { + ereport(ERROR, (errmsg("could not connect to the publisher: %s", + PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); + } + + /* Get the table list from publisher. */ + pubrel_names = fetch_table_list(sub->publications); + + /* Get local table list. */ + subrel_states = GetSubscriptionRelations(sub->oid, false); + + /* + * Build qsorted array of local table oids for faster lookup. 
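Because the CreateSubscription path above now creates the replication slot on the publisher and seeds pg_subscription_rel before any tablesync worker runs, a quick publisher-side sanity check is possible; a hypothetical query (the slot name defaults to the subscription name):

    -- Run on the publisher after CREATE SUBSCRIPTION mysub ...
    SELECT slot_name, plugin, slot_type, active
      FROM pg_replication_slots
     WHERE slot_name = 'mysub';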
+ * This can potentially contain all tables in the database so + * speed of lookup is important. + */ + subrel_local_oids = (Oid*)palloc(list_length(subrel_states) * sizeof(Oid)); + off = 0; + foreach (lc, subrel_states) { + SubscriptionRelState *relstate = (SubscriptionRelState *)lfirst(lc); + subrel_local_oids[off++] = relstate->relid; + } + qsort(subrel_local_oids, list_length(subrel_states), sizeof(Oid), oid_cmp); + + /* + * Rels that we want to remove from subscription and drop any slots + * and origins corresponding to them. + */ + sub_remove_rels = (SubRemoveRels*)palloc(list_length(subrel_states) * sizeof(SubRemoveRels)); + /* + * Walk over the remote tables and try to match them to locally + * known tables. If the table is not known locally create a new state + * for it. + * + * Also builds array of local oids of remote tables for the next step. + */ + off = 0; + pubrel_local_oids = (Oid *)palloc(list_length(pubrel_names) * sizeof(Oid)); + + foreach (lc, pubrel_names) { + RangeVar *rv = (RangeVar *)lfirst(lc); + Oid relid; + + relid = RangeVarGetRelid(rv, AccessShareLock, false); + pubrel_local_oids[off++] = relid; + + if (!bsearch(&relid, subrel_local_oids, list_length(subrel_states), sizeof(Oid), oid_cmp)) { + AddSubscriptionRelState(sub->oid, relid, copy_data ? SUBREL_STATE_INIT : SUBREL_STATE_READY); + ereport(DEBUG1, (errmsg("table \"%s.%s\" added to subscription \"%s\"", + rv->schemaname, rv->relname, sub->name))); + } + } + + /* + * Next remove state for tables we should not care about anymore using + * the data we collected above + */ + qsort(pubrel_local_oids, list_length(pubrel_names), sizeof(Oid), oid_cmp); + + remove_rel_len = 0; + for (off = 0; off < list_length(subrel_states); off++) { + Oid relid = subrel_local_oids[off]; + + if (!bsearch(&relid, pubrel_local_oids, list_length(pubrel_names), sizeof(Oid), oid_cmp)) { + char state; + XLogRecPtr statelsn; + + /* + * Lock pg_subscription_rel with AccessExclusiveLock to + * prevent any race conditions with the apply worker + * re-launching workers at the same time this code is trying + * to remove those tables. + * + * Even if new worker for this particular rel is restarted it + * won't be able to make any progress as we hold exclusive + * lock on subscription_rel till the transaction end. It will + * simply exit as there is no corresponding rel entry. + * + * This locking also ensures that the state of rels won't + * change till we are done with this refresh operation. + */ + if (!rel) + rel = heap_open(SubscriptionRelRelationId, AccessExclusiveLock); + + /* Last known rel state. */ + state = GetSubscriptionRelState(sub->oid, relid, &statelsn); + + sub_remove_rels[remove_rel_len].relid = relid; + sub_remove_rels[remove_rel_len++].state = state; + + RemoveSubscriptionRel(sub->oid, relid); + + logicalrep_worker_stop(sub->oid, relid); + + /* + * For READY state, we would have already dropped the + * tablesync origin. + */ + if (state != SUBREL_STATE_READY) { + char originname[NAMEDATALEN]; + + /* + * Drop the tablesync's origin tracking if exists. + * + * It is possible that the origin is not yet created for + * tablesync worker, this can happen for the states before + * SUBREL_STATE_FINISHEDCOPY. The apply worker can also + * concurrently try to drop the origin and by this time + * the origin might be already removed. For these reasons, + * passing missing_ok = true. 
+ */ + ReplicationOriginNameForTablesync(sub->oid, relid, originname, sizeof(originname)); + replorigin_drop_by_name(originname, true, false); + } + + ereport(DEBUG1, (errmsg("table \"%s.%s\" removed from subscription \"%s\"", + get_namespace_name(get_rel_namespace(relid)), get_rel_name(relid), sub->name))); + } + } + + /* + * Drop the tablesync slots associated with removed tables. This has + * to be at the end because otherwise if there is an error while doing + * the database operations we won't be able to rollback dropped slots. + */ + for (off = 0; off < remove_rel_len; off++) { + if (sub_remove_rels[off].state != SUBREL_STATE_READY && + sub_remove_rels[off].state != SUBREL_STATE_SYNCDONE) { + char syncslotname[NAMEDATALEN] = {0}; + + /* + * For READY/SYNCDONE states we know the tablesync slot has + * already been dropped by the tablesync worker. + * + * For other states, there is no certainty, maybe the slot + * does not exist yet. Also, if we fail after removing some of + * the slots, next time, it will again try to drop already + * dropped slots and fail. For these reasons, we allow + * missing_ok = true for the drop. + */ + ReplicationSlotNameForTablesync(sub->oid, sub_remove_rels[off].relid, syncslotname, + sizeof(syncslotname)); + ReplicationSlotDropAtPubNode(syncslotname, true); + } + } + } + PG_CATCH(); + { + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect(); + + if (rel) + heap_close(rel, NoLock); + PG_RE_THROW(); + } + PG_END_TRY(); + + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect(); + + if (rel) + heap_close(rel, NoLock); +} + /* * Alter the existing subscription. */ -ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt) +ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt, bool isTopLevel) { if (t_thrd.proc->workingVersionNum < PUBLICATION_VERSION_NUM) { ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), @@ -511,6 +738,8 @@ ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt) char *conninfo; char *slot_name; bool slotname_given; + bool copy_data; + bool copy_data_given; List *publications; Subscription *sub; int rc; @@ -535,13 +764,21 @@ ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt) subid = HeapTupleGetOid(tup); sub = GetSubscription(subid, false); + + /* Lock the subscription so nobody else can do anything with it. */ + LockSharedObject(SubscriptionRelationId, subid, 0, AccessExclusiveLock); + enabled = sub->enabled; finalSlotName = sub->name; encryptConninfo = sub->conninfo; /* Parse options. */ parse_subscription_options(stmt->options, &conninfo, &publications, &enabled_given, &enabled, &slotname_given, - &slot_name, &synchronous_commit, &binary_given, &binary); + &slot_name, &synchronous_commit, &binary_given, &binary, ©_data_given, ©_data); + + if (stmt->refresh) { + PreventTransactionChain(isTopLevel, "ALTER SUBSCRIPTION ... REFRESH"); + } /* Form a new tuple. 
*/ rc = memset_s(nulls, sizeof(nulls), false, sizeof(nulls)); @@ -648,7 +885,7 @@ ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt) } if (createSlot) { - CreateSlotInPublisher(finalSlotName); + CreateSlotInPublisherAndInsertSubRel(finalSlotName, subid, publications); } /* no need to validate replication slot if the slot is created just by ourself */ @@ -660,6 +897,10 @@ ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt) ApplyLauncherWakeupAtCommit(); } + if (stmt->refresh) { + AlterSubscription_refresh(sub, copy_data); + } + if (needFreeConninfo) { pfree_ext(encryptConninfo); } @@ -688,9 +929,8 @@ void DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) List *subWorkers; ListCell *lc; char originname[NAMEDATALEN]; - char *err = NULL; - StringInfoData cmd; int rc; + List *rstates = NIL; /* * Lock pg_subscription with AccessExclusiveLock to ensure that the @@ -766,9 +1006,6 @@ void DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) ReleaseSysCache(tup); - /* Clean up dependencies */ - deleteSharedDependencyRecordsFor(SubscriptionRelationId, subid, 0); - /* * Stop all the subscription workers immediately. * @@ -791,47 +1028,107 @@ void DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) foreach (lc, subWorkers) { LogicalRepWorker *w = (LogicalRepWorker *)lfirst(lc); - logicalrep_worker_stop(w->subid); + logicalrep_worker_stop(w->subid, w->relid); } list_free(subWorkers); + /* + * Cleanup of tablesync replication origins. + * + * Any READY-state relations would already have dealt with clean-ups. + * + * Note that the state can't change because we have already stopped both + * the apply and tablesync workers and they can't restart because of + * exclusive lock on the subscription. + */ + rstates = GetSubscriptionRelations(subid, true); + foreach (lc, rstates) { + SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc); + Oid relid = rstate->relid; + + /* Only cleanup resources of tablesync workers */ + if (!OidIsValid(relid)) + continue; + + /* + * Drop the tablesync's origin tracking if exists. + * + * It is possible that the origin is not yet created for tablesync + * worker so passing missing_ok = true. This can happen for the states + * before SUBREL_STATE_FINISHEDCOPY. + */ + ReplicationOriginNameForTablesync(subid, relid, originname, sizeof(originname)); + replorigin_drop_by_name(originname, true, false); + } + + /* Clean up dependencies */ + deleteSharedDependencyRecordsFor(SubscriptionRelationId, subid, 0); + + /* Remove any associated relation synchronization states. */ + RemoveSubscriptionRel(subid, InvalidOid); + /* Remove the origin tracking if exists. */ rc = sprintf_s(originname, sizeof(originname), "pg_%u", subid); securec_check_ss(rc, "", ""); - replorigin_drop_by_name(originname, true); + replorigin_drop_by_name(originname, true, false); /* If there is no slot associated with the subscription, we can finish here. */ - if (!slotname) { + if (!slotname && rstates == NIL) { + pfree_ext(conninfo); heap_close(rel, NoLock); return; } - /* - * Otherwise drop the replication slot at the publisher node using - * the replication connection. 
- */ - initStringInfo(&cmd); - appendStringInfo(&cmd, "DROP_REPLICATION_SLOT %s", quote_identifier(slotname)); - if (!AttemptConnectPublisher(conninfo, slotname, true)) { - ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg( - "could not connect to publisher."))); + if (!slotname) { + /* be tidy */ + list_free(rstates); + pfree_ext(conninfo); + heap_close(rel, NoLock); + return; + } else { + ReportSlotConnectionError(rstates, subid, slotname); + } } PG_TRY(); { - int sqlstate = 0; - bool res = WalReceiverFuncTable[GET_FUNC_IDX].walrcv_command(cmd.data, &err, &sqlstate); - if (!res && sqlstate == ERRCODE_UNDEFINED_OBJECT) { - /* drop replication slot failed cause it doesn't exist on publisher, give a warning and continue */ - ereport(WARNING, (errmsg("could not drop the replication slot \"%s\" on publisher", slotname), - errdetail("The error was: %s", err))); - } else if (!res) { - ereport(ERROR, (errmsg("could not drop the replication slot \"%s\" on publisher", slotname), - errdetail("The error was: %s", err))); - } else { - ereport(NOTICE, (errmsg("dropped replication slot \"%s\" on publisher", slotname))); + foreach (lc, rstates) { + SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc); + Oid relid = rstate->relid; + + /* Only cleanup resources of tablesync workers */ + if (!OidIsValid(relid)) + continue; + + /* + * Drop the tablesync slots associated with removed tables. + * + * For SYNCDONE/READY states, the tablesync slot is known to have + * already been dropped by the tablesync worker. + * + * For other states, there is no certainty, maybe the slot does + * not exist yet. Also, if we fail after removing some of the + * slots, next time, it will again try to drop already dropped + * slots and fail. For these reasons, we allow missing_ok = true + * for the drop. + */ + if (rstate->state != SUBREL_STATE_SYNCDONE) { + char syncslotname[NAMEDATALEN] = {0}; + + ReplicationSlotNameForTablesync(subid, relid, syncslotname, sizeof(syncslotname)); + ReplicationSlotDropAtPubNode(syncslotname, true); + } } + + list_free(rstates); + + /* + * If there is a slot associated with the subscription, then drop the + * replication slot at the publisher. + */ + if (slotname) + ReplicationSlotDropAtPubNode(slotname, true); } PG_CATCH(); { @@ -844,10 +1141,50 @@ void DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel) (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect(); pfree_ext(conninfo); - pfree(cmd.data); heap_close(rel, NoLock); } +/* + * Drop the replication slot at the publisher node using the replication + * connection. + * + * missing_ok - if true then only issue a LOG message if the slot doesn't + * exist. 
+ */ +void ReplicationSlotDropAtPubNode(char *slotname, bool missing_ok) +{ + StringInfoData cmd; + + Assert(t_thrd.libwalreceiver_cxt.streamConn); + + initStringInfo(&cmd); + appendStringInfo(&cmd, "DROP_REPLICATION_SLOT %s", quote_identifier(slotname)); + + PG_TRY(); + { + WalRcvExecResult *res = WalReceiverFuncTable[GET_FUNC_IDX].walrcv_exec(cmd.data, 0, NULL); + if (res->status != WALRCV_OK_COMMAND && missing_ok && res->sqlstate == ERRCODE_UNDEFINED_OBJECT) { + /* drop replication slot failed cause it doesn't exist on publisher, give a warning and continue */ + ereport(WARNING, (errmsg("could not drop the replication slot \"%s\" on publisher", slotname), + errdetail("The error was: %s", res->err))); + } else if (res->status != WALRCV_OK_COMMAND) { + ereport(ERROR, (errmsg("could not drop the replication slot \"%s\" on publisher", slotname), + errdetail("The error was: %s", res->err))); + } else { + ereport(NOTICE, (errmsg("dropped replication slot \"%s\" on publisher", slotname))); + } + walrcv_clear_result(res); + } + PG_CATCH(); + { + FreeStringInfo(&cmd); + PG_RE_THROW(); + } + PG_END_TRY(); + + FreeStringInfo(&cmd); +} + /* * Internal workhorse for changing a subscription owner */ @@ -1119,3 +1456,103 @@ char* EncryptOrDecryptConninfo(const char* conninfo, const char action) return conninfoNew; } + +/* + * Get the list of tables which belong to specified publications on the + * publisher connection. + */ +static List* fetch_table_list(List *publications) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[2] = {TEXTOID, TEXTOID}; + ListCell *lc; + bool first; + List *tablelist = NIL; + + Assert(list_length(publications) > 0); + + initStringInfo(&cmd); + appendStringInfo(&cmd, "SELECT DISTINCT t.schemaname, t.tablename\n" + " FROM pg_catalog.pg_publication_tables t\n" + " WHERE t.pubname IN ("); + first = true; + foreach (lc, publications) { + char *pubname = strVal(lfirst(lc)); + + if (first) + first = false; + else + appendStringInfoString(&cmd, ", "); + + appendStringInfo(&cmd, "%s", quote_literal_cstr(pubname)); + } + appendStringInfoString(&cmd, ")"); + + res = (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_exec(cmd.data, 2, tableRow); + pfree(cmd.data); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not receive list of replicated tables from the publisher: %s", + res->err))); + + /* Process tables. */ + slot = MakeSingleTupleTableSlot(res->tupledesc); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) { + char *nspname; + char *relname; + bool isnull; + RangeVar *rv; + + nspname = TextDatumGetCString(tableam_tslot_getattr(slot, 1, &isnull)); + Assert(!isnull); + relname = TextDatumGetCString(tableam_tslot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + rv = makeRangeVar(pstrdup(nspname), pstrdup(relname), -1); + tablelist = lappend(tablelist, rv); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + walrcv_clear_result(res); + + return tablelist; +} + +/* + * This is to report the connection failure while dropping replication slots. + * Here, we report the WARNING for all tablesync slots so that user can drop + * them manually, if required. 
+ */ +static void ReportSlotConnectionError(List *rstates, Oid subid, char *slotname) +{ + ListCell *lc; + + foreach (lc, rstates) { + SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc); + Oid relid = rstate->relid; + + /* Only cleanup resources of tablesync workers */ + if (!OidIsValid(relid)) + continue; + + /* + * Caller needs to ensure that relstate doesn't change underneath us. + * See DropSubscription where we get the relstates. + */ + if (rstate->state != SUBREL_STATE_SYNCDONE) { + char syncslotname[NAMEDATALEN] = {0}; + + ReplicationSlotNameForTablesync(subid, relid, syncslotname, sizeof(syncslotname)); + ereport(WARNING, (errmsg("could not drop tablesync replication slot \"%s\"", syncslotname))); + } + } + + ereport(ERROR, (errmsg("could not connect to publisher when attempting to " + "drop the replication slot \"%s\"", slotname), + errdetail("The error was: %s", PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); +} diff --git a/src/gausskernel/process/postmaster/pgstat.cpp b/src/gausskernel/process/postmaster/pgstat.cpp index 79d69bdb9e..b6ecebb2ef 100644 --- a/src/gausskernel/process/postmaster/pgstat.cpp +++ b/src/gausskernel/process/postmaster/pgstat.cpp @@ -783,10 +783,10 @@ static void pgstat_free_tablist(void) /* ---------- * pgstat_report_stat() - * - * Called from tcop/postgres.c to send the so far collected per-table - * and function usage statistics to the collector. Note that this is - * called only when not within a transaction, so it is fair to use - * transaction stop time as an approximation of current time. + * Must be called by processes that performs DML: tcop/postgres.c, logical + * receiver processes, SPI worker, etc. to send the so far collected + * per-table and function usage statistics to the collector. 
Note that this + * is called only when not within a transaction, so it is fair to use * ---------- */ void pgstat_report_stat(bool force) @@ -4571,6 +4571,15 @@ const char* pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_UNDO_META_SYNC: event_name = "UndoMetaSync"; break; + case WAIT_EVENT_LOGICAL_SYNC_DATA: + event_name = "LogicalSyncData"; + break; + case WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE: + event_name = "LogicalSyncStateChange"; + break; + case WAIT_EVENT_REPLICATION_ORIGIN_DROP: + event_name = "ReplicationOriginDrop"; + break; default: event_name = "unknown wait event"; break; diff --git a/src/gausskernel/process/tcop/utility.cpp b/src/gausskernel/process/tcop/utility.cpp index d230593fd5..8df7e05cb1 100755 --- a/src/gausskernel/process/tcop/utility.cpp +++ b/src/gausskernel/process/tcop/utility.cpp @@ -7019,7 +7019,7 @@ void standard_ProcessUtility(Node* parse_tree, const char* query_string, ParamLi errmsg("openGauss does not support SUBSCRIPTION yet"), errdetail("The feature is not currently supported"))); #endif - AlterSubscription((AlterSubscriptionStmt *) parse_tree); + AlterSubscription((AlterSubscriptionStmt *) parse_tree, is_top_level); break; case T_DropSubscriptionStmt: #if defined(ENABLE_MULTIPLE_NODES) || defined(ENABLE_LITE_MODE) diff --git a/src/gausskernel/process/threadpool/knl_thread.cpp b/src/gausskernel/process/threadpool/knl_thread.cpp index 068d74627c..9dd53d605b 100755 --- a/src/gausskernel/process/threadpool/knl_thread.cpp +++ b/src/gausskernel/process/threadpool/knl_thread.cpp @@ -1471,6 +1471,7 @@ static void knl_t_walsender_init(knl_t_walsender_context* walsender_cxt) walsender_cxt->is_obsmode = false; walsender_cxt->standbyConnection = false; walsender_cxt->restoreLogicalLogHead = NULL; + walsender_cxt->isUseSnapshot = false; } static void knl_t_tsearch_init(knl_t_tsearch_context* tsearch_cxt) diff --git a/src/gausskernel/storage/ipc/pmsignal.cpp b/src/gausskernel/storage/ipc/pmsignal.cpp index 5badb9836b..62b9aa389b 100644 --- a/src/gausskernel/storage/ipc/pmsignal.cpp +++ b/src/gausskernel/storage/ipc/pmsignal.cpp @@ -324,8 +324,13 @@ void MarkPostmasterChildNormal(void) Assert(slot > 0 && slot <= t_thrd.shemem_ptr_cxt.PMSignalState->num_child_flags); slot--; + /* + * Walsender for subscription would create slot and start replication + * during one connect. So it can be PM_CHILD_ACTIVE here. + */ Assert(t_thrd.shemem_ptr_cxt.PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER || - t_thrd.shemem_ptr_cxt.PMSignalState->PMChildFlags[slot] == PM_CHILD_DATASENDER); + t_thrd.shemem_ptr_cxt.PMSignalState->PMChildFlags[slot] == PM_CHILD_DATASENDER || + t_thrd.shemem_ptr_cxt.PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE); t_thrd.shemem_ptr_cxt.PMSignalState->PMChildFlags[slot] = PM_CHILD_ACTIVE; } diff --git a/src/gausskernel/storage/ipc/procarray.cpp b/src/gausskernel/storage/ipc/procarray.cpp index 1da108c3d6..2599635549 100755 --- a/src/gausskernel/storage/ipc/procarray.cpp +++ b/src/gausskernel/storage/ipc/procarray.cpp @@ -1988,7 +1988,7 @@ RETRY_GET: * * Returns TRUE if successful, FALSE if source xact is no longer running. 
*/ -bool ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid) +bool ProcArrayInstallImportedXmin(TransactionId xmin, VirtualTransactionId *sourcevxid) { bool result = false; ProcArrayStruct* arrayP = g_instance.proc_array_idx; @@ -1996,7 +1996,7 @@ bool ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid) Assert(TransactionIdIsNormal(xmin)); - if (!TransactionIdIsNormal(sourcexid)) + if (!sourcevxid) return false; /* Get lock so source xact can't end while we're doing this */ @@ -2008,9 +2008,10 @@ bool ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid) volatile PGXACT* pgxact = &g_instance.proc_base_all_xacts[pgprocno]; TransactionId xid; - xid = pgxact->xid; /* fetch just once */ - - if (xid != sourcexid) + /* We are only interested in the specific virtual transaction. */ + if (proc->backendId != sourcevxid->backendId) + continue; + if (proc->lxid != sourcevxid->localTransactionId) continue; /* diff --git a/src/gausskernel/storage/lmgr/predicate.cpp b/src/gausskernel/storage/lmgr/predicate.cpp index 3c09a88ec1..823ff45e03 100644 --- a/src/gausskernel/storage/lmgr/predicate.cpp +++ b/src/gausskernel/storage/lmgr/predicate.cpp @@ -149,7 +149,7 @@ * predicate lock maintenance * GetSerializableTransactionSnapshot(Snapshot snapshot) * SetSerializableTransactionSnapshot(Snapshot snapshot, - * TransactionId sourcexid) + * VirtualTransactionId *sourcevxid) * RegisterPredicateLockingXid(void) * PredicateLockRelation(Relation relation, Snapshot snapshot) * PredicateLockPage(Relation relation, BlockNumber blkno, @@ -353,7 +353,8 @@ static void OldSerXidSetActiveSerXmin(TransactionId xid); static uint32 predicatelock_hash(const void *key, Size keysize); static void SummarizeOldestCommittedSxact(void); static Snapshot GetSafeSnapshot(Snapshot snapshot); -static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, TransactionId sourcexid); +static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, VirtualTransactionId *sourcevxid, + int sourcepid); static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag); static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG* tag, PREDICATELOCKTARGETTAG* parent); static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG* newtargettag); @@ -1312,7 +1313,7 @@ static Snapshot GetSafeSnapshot(Snapshot origSnapshot) * our caller passed to us. The pointer returned is actually the same * one passed to it, but we avoid assuming that here. */ - snapshot = GetSerializableTransactionSnapshotInt(origSnapshot, InvalidTransactionId); + snapshot = GetSerializableTransactionSnapshotInt(origSnapshot, NULL, InvalidPid); if (t_thrd.xact_cxt.MySerializableXact == InvalidSerializableXact) return snapshot; /* no concurrent r/w xacts; it's safe */ @@ -1390,7 +1391,7 @@ Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot) if (u_sess->attr.attr_common.XactReadOnly && u_sess->attr.attr_storage.XactDeferrable) return GetSafeSnapshot(snapshot); - return GetSerializableTransactionSnapshotInt(snapshot, InvalidTransactionId); + return GetSerializableTransactionSnapshotInt(snapshot, NULL, InvalidPid); } /* @@ -1403,7 +1404,7 @@ Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot) * transaction; and if we're read-write, the source transaction must not be * read-only. 
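For context on the xid-to-vxid switch above, here is a tiny illustrative sketch (an assumption, not code from the patch) of how the exporting side could capture the source backend's virtual transaction id, using only the PGPROC fields that the new ProcArrayInstallImportedXmin compares against.

    #include "postgres.h"
    #include "storage/proc.h"

    /* Sketch only: backendId and lxid are the fields matched in the hunk above. */
    static VirtualTransactionId
    sketch_capture_source_vxid(const PGPROC *proc)
    {
        VirtualTransactionId vxid;

        vxid.backendId = proc->backendId;
        vxid.localTransactionId = proc->lxid;
        return vxid;
    }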
*/ -void SetSerializableTransactionSnapshot(Snapshot snapshot, TransactionId sourcexid) +void SetSerializableTransactionSnapshot(Snapshot snapshot, VirtualTransactionId *sourcevxid, int sourcepid) { Assert(IsolationIsSerializable()); @@ -1417,7 +1418,7 @@ void SetSerializableTransactionSnapshot(Snapshot snapshot, TransactionId sourcex ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE"))); - (void)GetSerializableTransactionSnapshotInt(snapshot, sourcexid); + (void)GetSerializableTransactionSnapshotInt(snapshot, sourcevxid, sourcepid); } /* @@ -1429,7 +1430,8 @@ void SetSerializableTransactionSnapshot(Snapshot snapshot, TransactionId sourcex * source xact is still running after we acquire SerializableXactHashLock. * We do that by calling ProcArrayInstallImportedXmin. */ -static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, TransactionId sourcexid) +static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, VirtualTransactionId *sourcevxid, + int sourcepid) { PGPROC *proc = NULL; VirtualTransactionId vxid; @@ -1473,14 +1475,14 @@ static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, Transac } while (sxact == NULL); /* Get the snapshot, or check that it's safe to use */ - if (!TransactionIdIsValid(sourcexid)) + if (!sourcevxid) snapshot = GetSnapshotData(snapshot, false); - else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcexid)) { + else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid)) { ReleasePredXact(sxact); LWLockRelease(SerializableXactHashLock); ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("could not import the requested snapshot"), - errdetail("The source transaction " XID_FMT " is not running anymore.", sourcexid))); + errdetail("The source process with pid %d is not running anymore.", sourcepid))); } /* diff --git a/src/gausskernel/storage/replication/libpqwalreceiver.cpp b/src/gausskernel/storage/replication/libpqwalreceiver.cpp index 30a338a1a1..bd6c1e8bd8 100755 --- a/src/gausskernel/storage/replication/libpqwalreceiver.cpp +++ b/src/gausskernel/storage/replication/libpqwalreceiver.cpp @@ -23,6 +23,7 @@ #include "libpq/libpq-int.h" #include "access/xlog.h" #include "access/xlog_internal.h" +#include "funcapi.h" #include "miscadmin.h" #include "replication/walreceiver.h" #include "replication/walsender_private.h" @@ -44,6 +45,9 @@ #ifdef HAVE_SYS_SELECT_H #include #endif +#include "utils/int8.h" +#include "utils/pg_lsn.h" +#include "utils/builtins.h" /* Prototypes for private functions */ static bool libpq_select(int timeout_ms); @@ -275,28 +279,44 @@ void StartRemoteStreaming(const LibpqrcvConnectParam *options) PQclear(res); } -void CreateRemoteReplicationSlot(XLogRecPtr startpoint, const char* slotname, bool isLogical) +void CreateRemoteReplicationSlot(XLogRecPtr startpoint, const char* slotname, bool isLogical, XLogRecPtr *lsn, + bool useSnapshot, CommitSeqNo *csn) { Assert(t_thrd.libwalreceiver_cxt.streamConn != NULL); - char cmd[1024]; - int nRet = 0; + StringInfoData cmd; + + initStringInfo(&cmd); + + appendStringInfo(&cmd, "CREATE_REPLICATION_SLOT \"%s\"", slotname); if (isLogical) { - nRet = snprintf_s(cmd, sizeof(cmd), sizeof(cmd) - 1, "CREATE_REPLICATION_SLOT \"%s\" LOGICAL pgoutput", - slotname); + appendStringInfoString(&cmd, " LOGICAL pgoutput"); } else { - nRet = snprintf_s(cmd, sizeof(cmd), sizeof(cmd) - 1, "CREATE_REPLICATION_SLOT \"%s\" PHYSICAL %X/%X", slotname, - 
(uint32)(startpoint >> 32), (uint32)(startpoint)); + appendStringInfo(&cmd, " PHYSICAL %X/%X", (uint32)(startpoint >> 32), (uint32)(startpoint)); + } + + if (useSnapshot) { + appendStringInfoString(&cmd, " USE_SNAPSHOT"); } - securec_check_ss(nRet, "", ""); - PGresult *res = libpqrcv_PQexec(cmd); + PGresult *res = libpqrcv_PQexec(cmd.data); if (PQresultStatus(res) != PGRES_TUPLES_OK) { PQclear(res); + pfree(cmd.data); ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS), errmsg("could not create replication slot %s : %s", slotname, PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); } + + if (lsn) { + *lsn = DatumGetLSN(DirectFunctionCall1Coll(pg_lsn_in, InvalidOid, CStringGetDatum(PQgetvalue(res, 0, 1)))); + } + + if (useSnapshot && csn) { + *csn = (CommitSeqNo)DatumGetInt64( + DirectFunctionCall1Coll(int8in, InvalidOid, CStringGetDatum(PQgetvalue(res, 0, 4)))); + } + pfree(cmd.data); PQclear(res); } @@ -1038,7 +1058,7 @@ retry: } if (!t_thrd.walreceiver_cxt.AmWalReceiverForFailover && slotname != NULL) { - CreateRemoteReplicationSlot(*startpoint, slotname, false); + CreateRemoteReplicationSlot(*startpoint, slotname, false, NULL); } /* Start streaming from the point requested by startup process */ @@ -1299,9 +1319,14 @@ bool libpqrcv_receive(int timeout, unsigned char *type, char **buffer, int *len) res = PQgetResult(t_thrd.libwalreceiver_cxt.streamConn); if (PQresultStatus(res) == PGRES_COMMAND_OK) { PQclear(res); - ereport(ERROR, (errcode(ERRCODE_INVALID_STATUS), - errmsg("replication terminated by primary server at %X/%X", - (uint32)(walrcv->receivedUpto >> 32), (uint32)walrcv->receivedUpto))); + + /* Verify that there are no more results */ + res = PQgetResult(t_thrd.libwalreceiver_cxt.streamConn); + if (res != NULL) + ereport(ERROR, + (errmsg("unexpected result after CommandComplete: %s", + PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); + *len = -1; return false; } @@ -1328,6 +1353,11 @@ bool libpqrcv_receive(int timeout, unsigned char *type, char **buffer, int *len) } /* Return received messages to caller */ + if (type == NULL) { + *buffer = t_thrd.libwalreceiver_cxt.recvBuf; + *len = rawlen; + return true; + } *type = *((unsigned char *)t_thrd.libwalreceiver_cxt.recvBuf); if (IS_SHARED_STORAGE_MODE && !AM_HADR_WAL_RECEIVER && *type == 'w') { *len = 0; @@ -1352,25 +1382,131 @@ void libpqrcv_send(const char *buffer, int nbytes) PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); } -bool libpqrcv_command(const char *cmd, char **err, int *sqlstate) +/* + * Convert tuple query result to tuplestore. + */ +static void libpqrcv_processTuples(PGresult *pgres, WalRcvExecResult *walres, const int nRetTypes, const Oid *retTypes) { - PGresult *res = libpqrcv_PQexec(cmd); + int tupn; + int coln; + int nfields = PQnfields(pgres); + HeapTuple tuple; + AttInMetadata *attinmeta; + MemoryContext rowcontext; + MemoryContext oldcontext; + + /* No point in doing anything here if there were no tuples returned. 
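As a usage illustration of the extended CreateRemoteReplicationSlot interface above, the following sketch (an assumption, not code from the patch; the slot name and include are illustrative) shows a logical caller requesting both the returned start position and the snapshot CSN.

    #include "postgres.h"
    #include "replication/libpqwalreceiver.h"    /* assumed declaration site */

    /* Sketch only: create a logical slot with an exported snapshot and read back LSN/CSN. */
    static void
    sketch_create_logical_slot_with_snapshot(void)
    {
        XLogRecPtr startlsn = InvalidXLogRecPtr;
        CommitSeqNo csn = InvalidCommitSeqNo;

        CreateRemoteReplicationSlot(InvalidXLogRecPtr, "illustrative_sync_slot", true,
                                    &startlsn, true, &csn);
    }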
*/ + if (PQntuples(pgres) == 0) + return; - if (PQresultStatus(res) != PGRES_COMMAND_OK) { - PQclear(res); - *err = pstrdup(PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)); - if (sqlstate != NULL && t_thrd.libwalreceiver_cxt.streamConn != NULL) { - *sqlstate = MAKE_SQLSTATE(t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[0], - t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[1], - t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[2], - t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[3], - t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[4]); + /* Make sure we got expected number of fields. */ + if (nfields != nRetTypes) + ereport(ERROR, + (errmsg("invalid query responser"), errdetail("Expected %d fields, got %d fields.", nRetTypes, nfields))); + + walres->tuplestore = tuplestore_begin_heap(true, false, u_sess->attr.attr_memory.work_mem); + + /* Create tuple descriptor corresponding to expected result. */ + walres->tupledesc = CreateTemplateTupleDesc(nRetTypes, false); + for (coln = 0; coln < nRetTypes; coln++) + TupleDescInitEntry(walres->tupledesc, (AttrNumber)coln + 1, PQfname(pgres, coln), retTypes[coln], -1, 0); + attinmeta = TupleDescGetAttInMetadata(walres->tupledesc); + + /* Create temporary context for local allocations. */ + rowcontext = AllocSetContextCreate(CurrentMemoryContext, "libpqrcv query result context", ALLOCSET_DEFAULT_SIZES); + + /* Process returned rows. */ + for (tupn = 0; tupn < PQntuples(pgres); tupn++) { + char *cstrs[MaxTupleAttributeNumber]; + + CHECK_FOR_INTERRUPTS(); + + /* Do the allocations in temporary context. */ + oldcontext = MemoryContextSwitchTo(rowcontext); + + /* + * Fill cstrs with null-terminated strings of column values. + */ + for (coln = 0; coln < nfields; coln++) { + if (PQgetisnull(pgres, tupn, coln)) + cstrs[coln] = NULL; + else + cstrs[coln] = PQgetvalue(pgres, tupn, coln); } - return false; + + /* Convert row to a tuple, and add it to the tuplestore */ + tuple = BuildTupleFromCStrings(attinmeta, cstrs); + tuplestore_puttuple(walres->tuplestore, tuple); + + /* Clean up */ + MemoryContextSwitchTo(oldcontext); + MemoryContextReset(rowcontext); } - PQclear(res); - return true; + MemoryContextDelete(rowcontext); +} + +/* + * Public interface for sending generic queries (and commands). + * + * This can only be called from process connected to database. + */ +WalRcvExecResult* libpqrcv_exec(const char *query, const int nRetTypes, const Oid *retTypes) +{ + PGresult *pgres = NULL; + WalRcvExecResult *walres = (WalRcvExecResult *)palloc0(sizeof(WalRcvExecResult)); + + if (u_sess->proc_cxt.MyDatabaseId == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("the query interface requires a database connection"))); + + pgres = libpqrcv_PQexec(query); + + switch (PQresultStatus(pgres)) { + case PGRES_SINGLE_TUPLE: + case PGRES_TUPLES_OK: + walres->status = WALRCV_OK_TUPLES; + libpqrcv_processTuples(pgres, walres, nRetTypes, retTypes); + break; + + case PGRES_COPY_IN: + walres->status = WALRCV_OK_COPY_IN; + break; + + case PGRES_COPY_OUT: + walres->status = WALRCV_OK_COPY_OUT; + break; + + case PGRES_COPY_BOTH: + walres->status = WALRCV_OK_COPY_BOTH; + break; + + case PGRES_COMMAND_OK: + walres->status = WALRCV_OK_COMMAND; + break; + + /* Empty query is considered error. 
*/ + case PGRES_EMPTY_QUERY: + walres->status = WALRCV_ERROR; + walres->err = _("empty query"); + break; + + case PGRES_NONFATAL_ERROR: + case PGRES_FATAL_ERROR: + case PGRES_BAD_RESPONSE: + walres->status = WALRCV_ERROR; + walres->err = pstrdup(PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)); + walres->sqlstate = MAKE_SQLSTATE(t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[0], + t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[1], + t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[2], + t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[3], + t_thrd.libwalreceiver_cxt.streamConn->last_sqlstate[4]); + break; + } + + PQclear(pgres); + return walres; } void HaSetRebuildRepInfoError(HaRebuildReason reason) diff --git a/src/gausskernel/storage/replication/logical/Makefile b/src/gausskernel/storage/replication/logical/Makefile index 176e913a9f..44963bc28c 100644 --- a/src/gausskernel/storage/replication/logical/Makefile +++ b/src/gausskernel/storage/replication/logical/Makefile @@ -6,6 +6,6 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -I$(srcdir) $(CPPFLAGS) -OBJS = decode.o launcher.o logical.o logicalfuncs.o origin.o proto.o relation.o reorderbuffer.o snapbuild.o worker.o parallel_decode_worker.o parallel_decode.o parallel_reorderbuffer.o logical_queue.o logical_parse.o +OBJS = decode.o launcher.o logical.o logicalfuncs.o origin.o proto.o relation.o reorderbuffer.o snapbuild.o worker.o parallel_decode_worker.o parallel_decode.o parallel_reorderbuffer.o logical_queue.o logical_parse.o tablesync.o include $(top_srcdir)/src/gausskernel/common.mk diff --git a/src/gausskernel/storage/replication/logical/launcher.cpp b/src/gausskernel/storage/replication/logical/launcher.cpp index 2a531feea6..613a1edbb9 100644 --- a/src/gausskernel/storage/replication/logical/launcher.cpp +++ b/src/gausskernel/storage/replication/logical/launcher.cpp @@ -27,6 +27,7 @@ #include "access/xact.h" #include "catalog/pg_subscription.h" +#include "catalog/pg_subscription_rel.h" #include "catalog/pg_database.h" #include "commands/user.h" @@ -58,7 +59,7 @@ static const int DEFAULT_NAPTIME_PER_CYCLE = 180000L; static const int wal_retrieve_retry_interval = 5000; -static const int PG_STAT_GET_SUBSCRIPTION_COLS = 7; +static const int PG_STAT_GET_SUBSCRIPTION_COLS = 8; static const int WAIT_SUB_WORKER_ATTACH_CYCLE = 50000L; /* 50ms */ static const int WAIT_SUB_WORKER_ATTACH_TIMEOUT = 1000000L; /* 1s */ @@ -157,9 +158,9 @@ List *logicalrep_workers_find(Oid subid, bool only_running) /* * Walks the workers array and searches for one that matches given - * subscription id. + * subscription id and relid. */ -static LogicalRepWorker *logicalrep_worker_find(Oid subid) +LogicalRepWorker *logicalrep_worker_find(Oid subid, Oid relid, bool only_running) { int i; LogicalRepWorker *res = NULL; @@ -168,7 +169,7 @@ static LogicalRepWorker *logicalrep_worker_find(Oid subid) /* Search for attached worker for a given subscription id. */ for (i = 0; i < g_instance.attr.attr_storage.max_logical_replication_workers; i++) { LogicalRepWorker *w = &t_thrd.applylauncher_cxt.applyLauncherShm->workers[i]; - if (w->subid == subid && w->proc) { + if (w->subid == subid && w->relid == relid && (!only_running || w->proc)) { res = w; break; } @@ -228,10 +229,9 @@ static void WaitForReplicationWorkerAttach() /* * Start new apply background worker. 
*/ -static void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid) +void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, Oid relid) { int slot; - int rc; LogicalRepWorker *worker = NULL; ereport(DEBUG1, (errmsg("starting logical replication worker for subscription \"%s\"", subname))); @@ -265,11 +265,19 @@ static void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, O } /* Prepare the worker info. */ - rc = memset_s(worker, sizeof(LogicalRepWorker), 0, sizeof(LogicalRepWorker)); - securec_check(rc, "", ""); + worker->proc = NULL; worker->dbid = dbid; worker->userid = userid; worker->subid = subid; + worker->relid = relid; + worker->relstate = SUBREL_STATE_UNKNOWN; + worker->relstate_lsn = InvalidXLogRecPtr; + worker->relcsn = InvalidCommitSeqNo; + worker->last_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->last_send_time); + TIMESTAMP_NOBEGIN(worker->last_recv_time); + worker->reply_lsn = InvalidXLogRecPtr; + TIMESTAMP_NOBEGIN(worker->reply_time); worker->workerLaunchTime = GetCurrentTimestamp(); t_thrd.applylauncher_cxt.applyLauncherShm->startingWorker = worker; @@ -283,13 +291,13 @@ static void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, O * Stop the logical replication worker for subid/relid, if any, and wait until * it detaches from the slot. */ -void logicalrep_worker_stop(Oid subid) +void logicalrep_worker_stop(Oid subid, Oid relid) { LogicalRepWorker *worker; LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - worker = logicalrep_worker_find(subid); + worker = logicalrep_worker_find(subid, relid, false); /* No worker, nothing to do. */ if (!worker) { LWLockRelease(LogicalRepWorkerLock); @@ -359,6 +367,34 @@ void logicalrep_worker_stop(Oid subid) LWLockRelease(LogicalRepWorkerLock); } +/* + * Wake up (using latch) any logical replication worker for specified sub/rel. + */ +void logicalrep_worker_wakeup(Oid subid, Oid relid) +{ + LogicalRepWorker *worker; + + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + + worker = logicalrep_worker_find(subid, relid, true); + + if (worker) + logicalrep_worker_wakeup_ptr(worker); + + LWLockRelease(LogicalRepWorkerLock); +} + +/* + * Wake up (using latch) the specified logical replication worker. + * + * Caller must hold lock, else worker->proc could change under us. + */ +void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker) +{ + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + SetLatch(&worker->proc->procLatch); +} + /* * Attach to a slot. */ @@ -478,6 +514,27 @@ static void LogicalrepLauncherSigusr1(SIGNAL_ARGS) errno = save_errno; } +/* + * Count the number of registered (not necessarily running) sync workers + * for a subscription. + */ +int logicalrep_sync_worker_count(Oid subid) +{ + int i; + int res = 0; + + Assert(LWLockHeldByMe(LogicalRepWorkerLock)); + + /* Search for attached worker for a given subscription id. 
*/ + for (i = 0; i < g_instance.attr.attr_storage.max_logical_replication_workers; i++) { + LogicalRepWorker *w = &t_thrd.applylauncher_cxt.applyLauncherShm->workers[i]; + if (w->subid == subid && OidIsValid(w->relid)) + res++; + } + + return res; +} + /* * ApplyLauncherShmemSize * Compute space needed for replication launcher shared memory @@ -510,8 +567,19 @@ void ApplyLauncherShmemInit(void) (ApplyLauncherShmStruct *)ShmemInitStruct("Logical Replication Launcher Data", memSize, &found); if (!found) { + int slot; + rc = memset_s(t_thrd.applylauncher_cxt.applyLauncherShm, memSize, 0, memSize); securec_check(rc, "", ""); + + /* Initialize memory and spin locks for each worker slot. */ + for (slot = 0; slot < g_instance.attr.attr_storage.max_logical_replication_workers; slot++) { + LogicalRepWorker *worker = &t_thrd.applylauncher_cxt.applyLauncherShm->workers[slot]; + + rc = memset_s(worker, sizeof(LogicalRepWorker), 0, sizeof(LogicalRepWorker)); + securec_check(rc, "", ""); + SpinLockInit(&worker->relmutex); + } } } @@ -689,7 +757,7 @@ void ApplyLauncherMain() } LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); - w = logicalrep_worker_find(sub->oid); + w = logicalrep_worker_find(sub->oid, InvalidOid, false); LWLockRelease(LogicalRepWorkerLock); /* Add to pending list if the subscription has no work attached */ @@ -706,7 +774,7 @@ void ApplyLauncherMain() foreach(lc, pendingSubList) { Subscription *readyToLaunchSub = (Subscription*)lfirst(lc); logicalrep_worker_launch(readyToLaunchSub->dbid, readyToLaunchSub->oid, - readyToLaunchSub->name, readyToLaunchSub->owner); + readyToLaunchSub->name, readyToLaunchSub->owner, InvalidOid); last_start_time = now; wait_time = wal_retrieve_retry_interval; } @@ -817,6 +885,10 @@ Datum pg_stat_get_subscription(PG_FUNCTION_ARGS) securec_check(rc, "", ""); values[idx++] = ObjectIdGetDatum(worker.subid); + if (OidIsValid(worker.relid)) + values[idx++] = ObjectIdGetDatum(worker.relid); + else + nulls[idx++] = true; values[idx++] = Int32GetDatum(worker_pid); if (XLogRecPtrIsInvalid(worker.last_lsn)) nulls[idx++] = true; diff --git a/src/gausskernel/storage/replication/logical/origin.cpp b/src/gausskernel/storage/replication/logical/origin.cpp index 15fa6cd43c..8c087df2f3 100644 --- a/src/gausskernel/storage/replication/logical/origin.cpp +++ b/src/gausskernel/storage/replication/logical/origin.cpp @@ -72,6 +72,7 @@ #include "funcapi.h" #include "miscadmin.h" +#include "pgstat.h" #include "access/genam.h" #include "access/heapam.h" @@ -269,12 +270,17 @@ RepOriginId replorigin_create(const char *roname) /* * Helper function to drop a replication origin. 
*/ -static void replorigin_drop_guts(Relation rel, RepOriginId roident) +static void replorigin_drop_guts(Relation rel, RepOriginId roident, bool nowait) { HeapTuple tuple = NULL; ScanKeyData key; int i; +restart: + tuple = NULL; + + CHECK_FOR_INTERRUPTS(); + /* cleanup the slot state info */ LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); @@ -284,9 +290,23 @@ static void replorigin_drop_guts(Relation rel, RepOriginId roident) /* found our slot */ if (state->roident == roident) { if (state->acquired_by != 0) { - ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), - errmsg("could not drop replication origin with OID %d, in use by PID %lu", state->roident, - state->acquired_by))); + if (nowait) { + ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("could not drop replication origin with OID %d, in use by PID %lu", state->roident, + state->acquired_by))); + } + + LWLockRelease(ReplicationOriginLock); + pthread_mutex_lock(&state->originMutex); + struct timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += SECS_PER_MINUTE / 2; + pgstat_report_waitevent(WAIT_EVENT_REPLICATION_ORIGIN_DROP); + (void)pthread_cond_timedwait(&state->orginCV, &state->originMutex, &ts); + pgstat_report_waitevent(WAIT_EVENT_END); + pthread_mutex_unlock(&state->originMutex); + + goto restart; } /* first WAL log */ @@ -329,7 +349,7 @@ static void replorigin_drop_guts(Relation rel, RepOriginId roident) * * Needs to be called in a transaction. */ -void replorigin_drop_by_name(const char *name, bool missing_ok) +void replorigin_drop_by_name(const char *name, bool missing_ok, bool nowait) { RepOriginId roident; Relation rel; @@ -349,7 +369,7 @@ void replorigin_drop_by_name(const char *name, bool missing_ok) roident = replorigin_by_name(name, missing_ok); if (OidIsValid(roident)) { - replorigin_drop_guts(rel, roident); + replorigin_drop_guts(rel, roident, nowait); } /* We keep the lock on pg_replication_origin until commit */ @@ -447,8 +467,23 @@ void ReplicationOriginShmemInit(void) u_sess->reporigin_cxt.repStatesShm->tranche_id = LWTRANCHE_REPLICATION_ORIGIN; for (i = 0; i < g_instance.attr.attr_storage.max_replication_slots; i++) { - LWLockInitialize(&u_sess->reporigin_cxt.repStatesShm->states[i].lock, - u_sess->reporigin_cxt.repStatesShm->tranche_id); + ReplicationState* state = &u_sess->reporigin_cxt.repStatesShm->states[i]; + + rc = pthread_condattr_init(&state->originAttr); + if (rc != 0) { + elog(FATAL, "Fail to init conattr for replication origin"); + } + rc = pthread_condattr_setclock(&state->originAttr, CLOCK_MONOTONIC); + if (rc != 0) { + elog(FATAL, "Fail to setclock replication origin"); + } + rc = pthread_cond_init(&state->orginCV, &state->originAttr); + if (rc != 0) { + elog(FATAL, "Fail to init cond for replication origin"); + } + state->originMutex = PTHREAD_MUTEX_INITIALIZER; + + LWLockInitialize(&state->lock, u_sess->reporigin_cxt.repStatesShm->tranche_id); } } } @@ -917,15 +952,25 @@ static XLogRecPtr replorigin_get_progress(RepOriginId node, bool flush) */ static void ReplicationOriginExitCleanup(int code, Datum arg) { + pthread_cond_t* cv = NULL; + pthread_mutex_t* mutex = NULL; + LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); if (u_sess->reporigin_cxt.curRepState != NULL && u_sess->reporigin_cxt.curRepState->acquired_by == t_thrd.proc_cxt.MyProcPid) { + cv = &u_sess->reporigin_cxt.curRepState->orginCV; + mutex = &u_sess->reporigin_cxt.curRepState->originMutex; u_sess->reporigin_cxt.curRepState->acquired_by = 0; u_sess->reporigin_cxt.curRepState = NULL; } 
LWLockRelease(ReplicationOriginLock); + if (cv) { + pthread_mutex_lock(mutex); + pthread_cond_signal(cv); + pthread_mutex_unlock(mutex); + } } /* @@ -1000,6 +1045,11 @@ void replorigin_session_setup(RepOriginId node) u_sess->reporigin_cxt.curRepState->acquired_by = t_thrd.proc_cxt.MyProcPid; LWLockRelease(ReplicationOriginLock); + + /* probably this one is pointless */ + pthread_mutex_lock(&u_sess->reporigin_cxt.curRepState->originMutex); + pthread_cond_signal(&u_sess->reporigin_cxt.curRepState->orginCV); + pthread_mutex_unlock(&u_sess->reporigin_cxt.curRepState->originMutex); } /* @@ -1010,6 +1060,8 @@ void replorigin_session_setup(RepOriginId node) */ static void replorigin_session_reset(void) { + pthread_cond_t* cv = NULL; + pthread_mutex_t* mutex = NULL; Assert(g_instance.attr.attr_storage.max_replication_slots != 0); if (u_sess->reporigin_cxt.curRepState == NULL) @@ -1019,9 +1071,16 @@ static void replorigin_session_reset(void) LWLockAcquire(ReplicationOriginLock, LW_EXCLUSIVE); u_sess->reporigin_cxt.curRepState->acquired_by = 0; + cv = &u_sess->reporigin_cxt.curRepState->orginCV; + mutex = &u_sess->reporigin_cxt.curRepState->originMutex; u_sess->reporigin_cxt.curRepState = NULL; LWLockRelease(ReplicationOriginLock); + if (cv && mutex) { + pthread_mutex_lock(mutex); + pthread_cond_signal(cv); + pthread_mutex_unlock(mutex); + } } /* @@ -1108,7 +1167,7 @@ Datum pg_replication_origin_drop(PG_FUNCTION_ARGS) name = text_to_cstring((text *)DatumGetPointer(PG_GETARG_DATUM(0))); - replorigin_drop_by_name(name, false); + replorigin_drop_by_name(name, false, true); pfree(name); diff --git a/src/gausskernel/storage/replication/logical/proto.cpp b/src/gausskernel/storage/replication/logical/proto.cpp index 2c537dd4c6..45c4df86ab 100644 --- a/src/gausskernel/storage/replication/logical/proto.cpp +++ b/src/gausskernel/storage/replication/logical/proto.cpp @@ -46,6 +46,9 @@ void logicalrep_write_begin(StringInfo out, ReorderBufferTXN *txn) pq_sendint64(out, txn->final_lsn); pq_sendint64(out, txn->commit_time); pq_sendint64(out, txn->xid); + if (t_thrd.walsender_cxt.isUseSnapshot) { + pq_sendint64(out, txn->csn); + } } /* @@ -59,6 +62,9 @@ void logicalrep_read_begin(StringInfo in, LogicalRepBeginData *begin_data) elog(ERROR, "final_lsn not set in begin message"); begin_data->committime = pq_getmsgint64(in); begin_data->xid = pq_getmsgint64(in); + if (t_thrd.applyworker_cxt.curWorker != NULL && AM_TABLESYNC_WORKER) { + begin_data->csn = pq_getmsgint64(in); + } } diff --git a/src/gausskernel/storage/replication/logical/relation.cpp b/src/gausskernel/storage/replication/logical/relation.cpp index 6a1a4efd09..ed1fc0e609 100644 --- a/src/gausskernel/storage/replication/logical/relation.cpp +++ b/src/gausskernel/storage/replication/logical/relation.cpp @@ -24,6 +24,7 @@ #include "replication/logicalrelation.h" #include "replication/worker_internal.h" #include "utils/inval.h" +#include "catalog/pg_subscription_rel.h" static const int DEFAULT_LOGICAL_RELMAP_HASH_ELEM = 128; /* @@ -331,6 +332,11 @@ LogicalRepRelMapEntry *logicalrep_rel_open(LogicalRepRelId remoteid, LOCKMODE lo entry->localrelvalid = true; } + if (entry->state != SUBREL_STATE_READY) + entry->state = GetSubscriptionRelState(t_thrd.applyworker_cxt.mySubscription->oid, + entry->localreloid, + &entry->statelsn); + return entry; } diff --git a/src/gausskernel/storage/replication/logical/snapbuild.cpp b/src/gausskernel/storage/replication/logical/snapbuild.cpp index 025f1626e3..6d376766e3 100644 --- 
a/src/gausskernel/storage/replication/logical/snapbuild.cpp +++ b/src/gausskernel/storage/replication/logical/snapbuild.cpp @@ -127,6 +127,7 @@ #include "access/transam.h" #include "access/xact.h" #include "access/xlog_internal.h" +#include "access/csnlog.h" #include "replication/logical.h" #include "replication/reorderbuffer.h" @@ -257,7 +258,7 @@ struct SnapBuild { static void SnapBuildPurgeCommittedTxn(SnapBuild *builder); /* snapshot building/manipulation/distribution functions */ -static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid); +static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder); static void SnapBuildFreeSnapshot(Snapshot snap); @@ -442,7 +443,7 @@ void SnapBuildSnapDecRefcount(Snapshot snap) * these snapshots; they have to copy them and fill in appropriate ->curcid * and ->subxip/subxcnt values. */ -static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid) +static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder) { Snapshot snapshot; Size ssize; @@ -513,55 +514,39 @@ static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder, TransactionId xid) } /* - * Export a snapshot so it can be set in another session with SET TRANSACTION - * SNAPSHOT. - * - * For that we need to start a transaction in the current backend as the - * importing side checks whether the source transaction is still open to make - * sure the xmin horizon hasn't advanced since then. + * Build the initial slot snapshot and convert it to normal snapshot that + * is understood by HeapTupleSatisfiesMVCC. * - * After that we convert a locally built snapshot into the normal variant - * understood by HeapTupleSatisfiesMVCC et al. + * The snapshot will be usable directly in current transaction or exported + * for loading in different transaction. 
*/ -const char *SnapBuildExportSnapshot(SnapBuild *builder) +Snapshot SnapBuildInitialSnapshot(SnapBuild *builder) { Snapshot snap = NULL; - char *snapname = NULL; TransactionId *newxip = NULL; const int newxcnt = 0; int maxcnt = GetMaxSnapshotXidCount(); + uint32 idx; + CommitSeqNo snapshotcsn = InvalidCommitSeqNo; + TransactionId tempXid; + + Assert(!u_sess->utils_cxt.FirstSnapshotSet); + Assert(u_sess->utils_cxt.XactIsoLevel = XACT_REPEATABLE_READ); if (builder->state != SNAPBUILD_CONSISTENT) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot export a snapshot before reaching a consistent state"))); + errmsg("cannot build an initial slot snapshot before reaching a consistent state"))); if (!builder->committed.includes_all_transactions) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot export a snapshot, not all transactions are monitored anymore"))); + errmsg("cannot build an initial slot snapshot, not all transactions are monitored anymore"))); /* so we don't overwrite the existing value */ if (TransactionIdIsValid(t_thrd.pgxact->xmin)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot export a snapshot when MyPgXact->xmin already is valid"))); - if (IsTransactionOrTransactionBlock()) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot export a snapshot from within a transaction"))); - - if (t_thrd.logical_cxt.SavedResourceOwnerDuringExport) - ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("can only export one snapshot at a time"))); - - t_thrd.logical_cxt.SavedResourceOwnerDuringExport = t_thrd.utils_cxt.CurrentResourceOwner; - t_thrd.logical_cxt.ExportInProgress = true; - - StartTransactionCommand(); - - Assert(!u_sess->utils_cxt.FirstSnapshotSet); - - /* There doesn't seem to a nice API to set these */ - u_sess->utils_cxt.XactIsoLevel = XACT_REPEATABLE_READ; - u_sess->attr.attr_common.XactReadOnly = true; + errmsg("cannot build an initial slot snapshot when MyPgXact->xmin already is valid"))); - snap = SnapBuildBuildSnapshot(builder, GetTopTransactionId()); + snap = SnapBuildBuildSnapshot(builder); /* * We know that snap->xmin is alive, enforced by the logical xmin @@ -573,6 +558,15 @@ const char *SnapBuildExportSnapshot(SnapBuild *builder) if (snap->xcnt > (uint32)maxcnt) maxcnt = snap->xcnt; + tempXid = snap->xmin; + while (tempXid > FirstNormalTransactionId) { + TransactionIdRetreat(tempXid); + snapshotcsn = CSNLogGetCommitSeqNo(tempXid); + + if (COMMITSEQNO_IS_COMMITTED(snapshotcsn) && !COMMITSEQNO_IS_FROZEN(snapshotcsn)) { + break; + } + } /* allocate in transaction context */ newxip = (TransactionId *)palloc(sizeof(TransactionId) * maxcnt); @@ -582,12 +576,56 @@ const char *SnapBuildExportSnapshot(SnapBuild *builder) * classical snapshot by marking all non-committed transactions as * in-progress. This can be expensive. */ + for (idx = 0; idx < snap->xcnt; idx++) { + CommitSeqNo csn = CSNLogGetCommitSeqNo(snap->xip[idx]); + if (snapshotcsn == InvalidCommitSeqNo || snapshotcsn < csn) { + snapshotcsn = csn; + } + } + snap->xcnt = newxcnt; snap->xip = newxip; + snap->snapshotcsn = snapshotcsn; + + return snap; +} + +/* + * Export a snapshot so it can be set in another session with SET TRANSACTION + * SNAPSHOT. + * + * For that we need to start a transaction in the current backend as the + * importing side checks whether the source transaction is still open to make + * sure the xmin horizon hasn't advanced since then. 
+ */ +const char *SnapBuildExportSnapshot(SnapBuild *builder) +{ + Snapshot snap; + char *snapname; + + if (IsTransactionOrTransactionBlock()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot export a snapshot from within a transaction"))); + + if (t_thrd.logical_cxt.SavedResourceOwnerDuringExport) + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("can only export one snapshot at a time"))); + + t_thrd.logical_cxt.SavedResourceOwnerDuringExport = t_thrd.utils_cxt.CurrentResourceOwner; + t_thrd.logical_cxt.ExportInProgress = true; + + StartTransactionCommand(); + + Assert(!u_sess->utils_cxt.FirstSnapshotSet); + + /* There doesn't seem to a nice API to set these */ + u_sess->utils_cxt.XactIsoLevel = XACT_REPEATABLE_READ; + u_sess->attr.attr_common.XactReadOnly = true; + + snap = SnapBuildInitialSnapshot(builder); /* - * now that we've built a plain snapshot, use the normal mechanisms for - * exporting it + * now that we've built a plain snapshot, make it active and use the + * normal mechanisms for exporting it. */ snapname = ExportSnapshot(snap, NULL); if (!RecoveryInProgress()) @@ -647,7 +685,7 @@ bool SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr ls if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) { /* only build a new snapshot if we don't have a prebuilt one */ if (builder->snapshot == NULL) { - builder->snapshot = SnapBuildBuildSnapshot(builder, xid); + builder->snapshot = SnapBuildBuildSnapshot(builder); /* inrease refcount for the snapshot builder */ SnapBuildSnapIncRefcount(builder->snapshot); } @@ -934,7 +972,7 @@ void SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, i if (builder->snapshot != NULL) { SnapBuildSnapDecRefcount(builder->snapshot); } - builder->snapshot = SnapBuildBuildSnapshot(builder, xid); + builder->snapshot = SnapBuildBuildSnapshot(builder); /* we might need to execute invalidations, add snapshot */ if (!ReorderBufferXidHasBaseSnapshot(builder->reorder, xid)) { @@ -1646,7 +1684,7 @@ static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) if (builder->snapshot != NULL) { SnapBuildSnapDecRefcount(builder->snapshot); } - builder->snapshot = SnapBuildBuildSnapshot(builder, InvalidTransactionId); + builder->snapshot = SnapBuildBuildSnapshot(builder); SnapBuildSnapIncRefcount(builder->snapshot); ReorderBufferSetRestartPoint(builder->reorder, lsn); diff --git a/src/gausskernel/storage/replication/logical/tablesync.cpp b/src/gausskernel/storage/replication/logical/tablesync.cpp new file mode 100644 index 0000000000..1be6a90d56 --- /dev/null +++ b/src/gausskernel/storage/replication/logical/tablesync.cpp @@ -0,0 +1,1018 @@ +/*------------------------------------------------------------------------- + * tablesync.c + * PostgreSQL logical replication: initial table data synchronization + * + * Copyright (c) 2012-2016, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/backend/replication/logical/tablesync.c + * + * NOTES + * This file contains code for initial table data synchronization for + * logical replication. + * + * The initial data synchronization is done separately for each table, + * in separate apply worker that only fetches the initial snapshot data + * from the publisher and then synchronizes the position in stream with + * the main apply worker. + * + * The are several reasons for doing the synchronization this way: + * - It allows us to parallelize the initial data synchronization + * which lowers the time needed for it to happen. 
+ * - The initial synchronization does not have to hold the xid and LSN + * for the time it takes to copy data of all tables, causing less + * bloat and lower disk consumption compared to doing the + * synchronization in single process for whole database. + * - It allows us to synchronize the tables added after the initial + * synchronization has finished. + * + * The stream position synchronization works in multiple steps: + * - Apply worker requests a tablesync worker to start, setting the new + * table state to INIT. + * - Tablesync worker starts; changes table state from INIT to DATASYNC while + * copying. + * - Tablesync worker does initial table copy; there is a FINISHEDCOPY (sync + * worker specific) state to indicate when the copy phase has completed, so + * if the worker crashes with this (non-memory) state then the copy will not + * be re-attempted. + * - Tablesync worker then sets table state to SYNCWAIT; waits for state change. + * - Apply worker periodically checks for tables in SYNCWAIT state. When + * any appear, it sets the table state to CATCHUP and starts loop-waiting + * until either the table state is set to SYNCDONE or the sync worker + * exits. + * - After the sync worker has seen the state change to CATCHUP, it will + * read the stream and apply changes (acting like an apply worker) until + * it catches up to the specified stream position. Then it sets the + * state to SYNCDONE. There might be zero changes applied between + * CATCHUP and SYNCDONE, because the sync worker might be ahead of the + * apply worker. + * - Once the state is set to SYNCDONE, the apply will continue tracking + * the table until it reaches the SYNCDONE stream position, at which + * point it sets state to READY and stops tracking. Again, there might + * be zero changes in between. + * + * So the state progression is always: INIT -> DATASYNC -> FINISHEDCOPY + * -> SYNCWAIT -> CATCHUP -> SYNCDONE -> READY. + * + * The catalog pg_subscription_rel is used to keep information about + * subscribed tables and their state. Some transient state during data + * synchronization is kept in shared memory. The states SYNCWAIT and + * CATCHUP only appear in memory. 
+ * + * Example flows look like this: + * - Apply is in front: + * sync:8 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:10 + * -> set in memory CATCHUP + * -> enter wait-loop + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> exit wait-loop + * -> continue rep + * apply:11 + * -> set in catalog READY + * + * - Sync is in front: + * sync:10 + * -> set in catalog FINISHEDCOPY + * -> set in memory SYNCWAIT + * apply:8 + * -> set in memory CATCHUP + * -> continue per-table filtering + * sync:10 + * -> set in catalog SYNCDONE + * -> exit + * apply:10 + * -> set in catalog READY + * -> stop per-table filtering + * -> continue rep + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" + +#include "access/xact.h" + +#include "catalog/pg_subscription_rel.h" +#include "catalog/pg_type.h" + +#include "commands/copy.h" +#include "commands/subscriptioncmds.h" + +#include "replication/logicallauncher.h" +#include "replication/logicalrelation.h" +#include "replication/walreceiver.h" +#include "replication/worker_internal.h" + +#include "storage/ipc.h" +#include "storage/lmgr.h" + +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "access/tableam.h" +#include "libpq/libpq-fe.h" + +static void finish_sync_worker(char *slotName = NULL); + +/* + * Exit routine for synchronization worker. + */ +static void finish_sync_worker(char *slotName) +{ + /* + * Commit any outstanding transaction. This is the usual case, unless + * there was nothing to do for the table. + */ + if (IsTransactionState()) { + CommitTransactionCommand(); + pgstat_report_stat(false); + } + + /* And flush all writes. */ + XLogWaitFlush(GetXLogWriteRecPtr()); + + ereport(LOG, (errmsg("logical replication table synchronization worker for subscription \"%s\"," + " table \"%s\" has finished", t_thrd.applyworker_cxt.mySubscription->name, + get_rel_name(t_thrd.applyworker_cxt.curWorker->relid)))); + + /* Stop gracefully */ + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect(); + + /* Cleanup the tablesync slot. */ + if (slotName != NULL) { + if (!AttemptConnectPublisher(t_thrd.applyworker_cxt.mySubscription->conninfo, slotName, true)) { + ereport(ERROR, (errmsg("could not connect to the publisher: %s", + PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); + } + /* + * It is important to give an error if we are unable to drop the slot, + * otherwise, it won't be dropped till the corresponding subscription + * is dropped. So passing missing_ok = false. + */ + ReplicationSlotDropAtPubNode(slotName, false); + /* Stop gracefully */ + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_disconnect(); + } + + /* Find the main apply worker and signal it. */ + logicalrep_worker_wakeup(t_thrd.applyworker_cxt.curWorker->subid, InvalidOid); + + proc_exit(0); +} + +/* + * Wait until the relation sync state is set in catalog to the expected + * one; return true when it happens. + * + * Returns false if the table sync worker or the table itself have + * disappeared, or the table state has been reset. + * + * Currently, this is used in the apply worker when transitioning from + * CATCHUP state to SYNCDONE. 
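Both wait helpers that follow poll a small piece of shared state and then block on the process latch with a timeout, so a crashed peer can never leave the waiter stuck. A rough portable analogue of that handshake, using a condition variable in place of the latch (an assumption made only for this illustration), is sketched here.

    // Portable sketch of the "wait for peer state change" handshake (illustration only;
    // the patch itself uses WaitLatch with WL_TIMEOUT and checks the peer's worker slot).
    #include <chrono>
    #include <condition_variable>
    #include <mutex>

    struct SyncHandshake {
        std::mutex mtx;
        std::condition_variable cv;   // stands in for the process latch
        char state = '\0';            // state the peer publishes, '\0' means unknown/reset
        bool peerAlive = true;        // stands in for logicalrep_worker_find() succeeding
    };

    // Returns true once the peer reaches expectedState, false if the peer disappears.
    static bool WaitForStateChange(SyncHandshake &sh, char expectedState)
    {
        std::unique_lock<std::mutex> lock(sh.mtx);
        for (;;) {
            if (sh.state == expectedState)
                return true;
            if (!sh.peerAlive || sh.state == '\0')
                return false;
            // Wake up on a signal or after a timeout, like WaitLatch(..., WL_TIMEOUT, ...).
            sh.cv.wait_for(lock, std::chrono::seconds(10));
        }
    }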
+ */ +static bool wait_for_relation_state_change(Oid relid, char expected_state) +{ + int rc; + char state; + + for (;;) { + LogicalRepWorker *worker; + XLogRecPtr statelsn; + + CHECK_FOR_INTERRUPTS(); + + state = GetSubscriptionRelState(t_thrd.applyworker_cxt.curWorker->subid, + relid, &statelsn); + + if (state == SUBREL_STATE_UNKNOWN) + break; + + if (state == expected_state) + return true; + + /* Check if the sync worker is still running and bail if not. */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(t_thrd.applyworker_cxt.curWorker->subid, relid, false); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) { + break; + } + + pgstat_report_waitevent(WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + rc = WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, 10000L); + pgstat_report_waitevent(WAIT_EVENT_END); + + /* emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + ResetLatch(&t_thrd.proc->procLatch); + } + + return false; +} + +/* + * Wait until the the apply worker changes the state of our synchronization + * worker to the expected one. + * + * Used when transitioning from SYNCWAIT state to CATCHUP. + * + * Returns false if the apply worker has disappeared. + */ +static bool wait_for_worker_state_change(char expected_state) +{ + int rc; + + for (;;) { + LogicalRepWorker *worker; + + CHECK_FOR_INTERRUPTS(); + + /* + * Done if already in correct state. (We assume this fetch is atomic + * enough to not give a misleading answer if we do it with no lock.) + */ + if (t_thrd.applyworker_cxt.curWorker->relstate == expected_state) + return true; + + /* + * Bail out if the apply worker has died, else signal it we're + * waiting. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + worker = logicalrep_worker_find(t_thrd.applyworker_cxt.curWorker->subid, InvalidOid, false); + if (worker && worker->proc) + logicalrep_worker_wakeup_ptr(worker); + LWLockRelease(LogicalRepWorkerLock); + if (!worker) + break; + + /* + * Wait. We expect to get a latch signal back from the apply worker, + * but use a timeout in case it dies without sending one. + */ + pgstat_report_waitevent(WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE); + rc = WaitLatch(&t_thrd.proc->procLatch, WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, 1000L); + pgstat_report_waitevent(WAIT_EVENT_END); + + /* emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + if (rc & WL_LATCH_SET) + ResetLatch(&t_thrd.proc->procLatch); + } + + return false; +} + +/* + * Callback from syscache invalidation. + */ +void invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue) +{ + t_thrd.applyworker_cxt.tableStatesValid = false; +} + +/* + * Determine the tablesync slot name. + * + * The name must not exceed NAMEDATALEN - 1 because of remote node constraints + * on slot name length. We append system_identifier to avoid slot_name + * collision with subscriptions in other clusters. With the current scheme + * pg_%u_sync_%u_UINT64_FORMAT (3 + 10 + 6 + 10 + 20 + '\0'), the maximum + * length of slot_name will be 50. + * + * The returned slot name is stored in the supplied buffer (syncslotname) with + * the given size. + * + * Note: We don't use the subscription slot name as part of tablesync slot name + * because we are responsible for cleaning up these slots and it could become + * impossible to recalculate what name to cleanup if the subscription slot name + * had changed. 
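The length arithmetic in the note above is easy to verify in isolation: with ten-digit OIDs and a twenty-digit system identifier the generated names stay well inside NAMEDATALEN. The sketch below uses plain snprintf instead of the snprintf_s wrappers and is for illustration only.

    // Worst-case lengths for the tablesync slot and origin names (illustration only).
    // "pg_" + 10-digit suboid + "_sync_" + 10-digit relid + "_" + 20-digit identifier
    // comes to 50 characters, comfortably below NAMEDATALEN (64).
    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        char slot[64];    // NAMEDATALEN
        char origin[64];
        uint32_t suboid = 4294967295U;   // widest possible Oid, 10 digits
        uint32_t relid = 4294967295U;
        uint64_t sysid = UINT64_MAX;     // widest possible system identifier, 20 digits

        int slotLen = std::snprintf(slot, sizeof(slot), "pg_%u_sync_%u_%" PRIu64, suboid, relid, sysid);
        int originLen = std::snprintf(origin, sizeof(origin), "pg_%u_%u", suboid, relid);

        std::printf("slot   %s (%d chars)\n", slot, slotLen);
        std::printf("origin %s (%d chars)\n", origin, originLen);
        return 0;
    }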
+ */ +void ReplicationSlotNameForTablesync(Oid suboid, Oid relid, char *syncslotname, int szslot) +{ + int rc = snprintf_s(syncslotname, szslot, szslot - 1, "pg_%u_sync_%u_" UINT64_FORMAT, suboid, + relid, GetSystemIdentifier()); + securec_check_ss(rc, "\0", "\0"); +} + +/* + * Form the origin name for tablesync. + * + * Return the name in the supplied buffer. + */ +void ReplicationOriginNameForTablesync(Oid suboid, Oid relid, char *originname, int szorgname) +{ + int rc = snprintf_s(originname, szorgname, szorgname - 1, "pg_%u_%u", suboid, relid); + securec_check_ss(rc, "\0", "\0"); +} + +/* + * Handle table synchronization cooperation from the synchronization + * worker. + * + * If the sync worker is in CATCHUP state and reached (or passed) the + * predetermined synchronization point in the WAL stream, mark the table as + * SYNCDONE and finish. + */ +static void process_syncing_tables_for_sync(XLogRecPtr current_lsn) +{ + LogicalRepWorker* myWorker = t_thrd.applyworker_cxt.curWorker; + + SpinLockAcquire(&myWorker->relmutex); + + if (myWorker->relstate == SUBREL_STATE_CATCHUP && current_lsn >= myWorker->relstate_lsn) { + char syncslotname[NAMEDATALEN] = {0}; + + myWorker->relstate = SUBREL_STATE_SYNCDONE; + myWorker->relstate_lsn = current_lsn; + + SpinLockRelease(&myWorker->relmutex); + + /* + * UpdateSubscriptionRelState must be called within a transaction. + * That transaction will be ended within the finish_sync_worker(). + */ + if (!IsTransactionState()) + StartTransactionCommand(); + + UpdateSubscriptionRelState(myWorker->subid, myWorker->relid, myWorker->relstate, myWorker->relstate_lsn); + + ReplicationSlotNameForTablesync(myWorker->subid, myWorker->relid, syncslotname, sizeof(syncslotname)); + finish_sync_worker(syncslotname); + } else + SpinLockRelease(&myWorker->relmutex); +} + +/* + * Handle table synchronization cooperation from the apply worker. + * + * Walk over all subscription tables that are individually tracked by the + * apply process (currently, all that have state other than + * SUBREL_STATE_READY) and manage synchronization for them. + * + * If there are tables that need synchronizing and are not being synchronized + * yet, start sync workers for them (if there are free slots for sync + * workers). + * + * For tables that are being synchronized already, check if sync workers + * either need action from the apply worker or have finished. This is the + * SYNCWAIT to CATCHUP transition. + * + * If the synchronization position is reached (SYNCDONE), then the table can + * be marked as READY and is no longer tracked. + */ +static void process_syncing_tables_for_apply(XLogRecPtr current_lsn) +{ + ListCell *lc; + int rc; + bool started_tx = false; + + Assert(!IsTransactionState()); + + /* We need up to date sync state info for subscription tables here. */ + if (!t_thrd.applyworker_cxt.tableStatesValid) { + MemoryContext oldctx; + List *rstates; + ListCell *lc; + SubscriptionRelState *rstate; + + /* Clean the old list. */ + list_free_deep(t_thrd.applyworker_cxt.tableStates); + t_thrd.applyworker_cxt.tableStates = NIL; + + StartTransactionCommand(); + started_tx = true; + + /* Fetch all non-ready tables. */ + rstates = GetSubscriptionRelations(t_thrd.applyworker_cxt.mySubscription->oid, true); + + /* Allocate the tracking info in a permanent memory context. 
*/ + oldctx = MemoryContextSwitchTo(t_thrd.applyworker_cxt.applyContext); + foreach (lc, rstates) { + rstate = (SubscriptionRelState *)palloc(sizeof(SubscriptionRelState)); + rc = memcpy_s(rstate, sizeof(SubscriptionRelState), lfirst(lc), sizeof(SubscriptionRelState)); + securec_check(rc, "\0", "\0"); + t_thrd.applyworker_cxt.tableStates = lappend(t_thrd.applyworker_cxt.tableStates, rstate); + } + MemoryContextSwitchTo(oldctx); + + t_thrd.applyworker_cxt.tableStatesValid = true; + } + + /* Process all tables that are being synchronized. */ + foreach (lc, t_thrd.applyworker_cxt.tableStates) { + SubscriptionRelState *rstate = (SubscriptionRelState *)lfirst(lc); + + if (rstate->state == SUBREL_STATE_SYNCDONE) { + /* + * Apply has caught up to the position where the table sync + * has finished. Time to mark the table as ready so that + * apply will just continue to replicate it normally. + */ + if (current_lsn >= rstate->lsn) { + char originname[NAMEDATALEN]; + + rstate->state = SUBREL_STATE_READY; + rstate->lsn = current_lsn; + if (!started_tx) { + StartTransactionCommand(); + started_tx = true; + } + + /* + * Remove the tablesync origin tracking if exists. + * + * The normal case origin drop is done here instead of in the + * process_syncing_tables_for_sync function because we don't + * allow to drop the origin till the process owning the origin + * is alive. + * + * There is a chance that the user is concurrently performing + * refresh for the subscription where we remove the table + * state and its origin and by this time the origin might be + * already removed. So passing missing_ok = true. + */ + ReplicationOriginNameForTablesync(t_thrd.applyworker_cxt.curWorker->subid, rstate->relid, originname, + sizeof(originname)); + replorigin_drop_by_name(originname, true, false); + + /* + * Update the state to READY only after the origin cleanup. + */ + UpdateSubscriptionRelState(t_thrd.applyworker_cxt.curWorker->subid, rstate->relid, + rstate->state, rstate->lsn); + } + } else { + LogicalRepWorker *syncworker; + + /* + * Look for a sync worker for this relation. + */ + LWLockAcquire(LogicalRepWorkerLock, LW_SHARED); + syncworker = logicalrep_worker_find(t_thrd.applyworker_cxt.curWorker->subid, rstate->relid, false); + if (syncworker) { + /* Found one, update our copy of its state */ + SpinLockAcquire(&syncworker->relmutex); + rstate->state = syncworker->relstate; + rstate->lsn = syncworker->relstate_lsn; + if (rstate->state == SUBREL_STATE_SYNCWAIT) { + /* + * Sync worker is waiting for apply. Tell sync worker it + * can catchup now. + */ + syncworker->relstate = SUBREL_STATE_CATCHUP; + syncworker->relstate_lsn = Max(syncworker->relstate_lsn, current_lsn); + } + SpinLockRelease(&syncworker->relmutex); + /* If we told worker to catch up, wait for it. */ + if (rstate->state == SUBREL_STATE_SYNCWAIT) { + /* Signal the sync worker, as it may be waiting for us. */ + if (syncworker->proc) + logicalrep_worker_wakeup_ptr(syncworker); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + /* + * Enter busy loop and wait for synchronization worker to + * reach expected state (or die trying). + */ + if (!started_tx) { + StartTransactionCommand(); + started_tx = true; + } + + wait_for_relation_state_change(rstate->relid, SUBREL_STATE_SYNCDONE); + } + else + LWLockRelease(LogicalRepWorkerLock); + } else { + /* + * If no sync worker for this table yet, could running sync + * workers for this subscription, while we have the lock, for + * later. 
+ */ + int nsyncworkers = logicalrep_sync_worker_count(t_thrd.applyworker_cxt.curWorker->subid); + + /* Now safe to release the LWLock */ + LWLockRelease(LogicalRepWorkerLock); + + /* + * If there are free sync worker slot(s), start a new sync + * worker for the table. + */ + if (nsyncworkers < g_instance.attr.attr_storage.max_sync_workers_per_subscription) { + logicalrep_worker_launch(t_thrd.applyworker_cxt.curWorker->dbid, + t_thrd.applyworker_cxt.mySubscription->oid, + t_thrd.applyworker_cxt.mySubscription->name, + t_thrd.applyworker_cxt.curWorker->userid, + rstate->relid); + } + } + } + } + + if (started_tx) { + CommitTransactionCommand(); + pgstat_report_stat(false); + } +} + +/* + * Process possible change(s) of tables that are being synchronized. + */ +void process_syncing_tables(XLogRecPtr current_lsn) +{ + if (AM_TABLESYNC_WORKER) + process_syncing_tables_for_sync(current_lsn); + else + process_syncing_tables_for_apply(current_lsn); +} + +/* + * Create list of columns for COPY based on logical relation mapping. + */ +static List *make_copy_attnamelist(LogicalRepRelMapEntry *rel) +{ + List *attnamelist = NIL; + TupleDesc desc = RelationGetDescr(rel->localrel); + int i; + + for (i = 0; i < desc->natts; i++) { + int remoteattnum = rel->attrmap[i]; + + /* Skip dropped attributes. */ + if (desc->attrs[i]->attisdropped) + continue; + + /* Skip attributes that are missing on remote side. */ + if (remoteattnum < 0) + continue; + + attnamelist = lappend(attnamelist, makeString(rel->remoterel.attnames[remoteattnum])); + } + + return attnamelist; +} + +/* + * Data source callback for the COPY FROM, which reads from the remote + * connection and passes the data back to our local COPY. + */ +static int copy_read_data(CopyState cstate, void *outbuf, int minread, int maxread) +{ + int bytesread = 0; + int avail; + int rc; + StringInfo copybuf = t_thrd.applyworker_cxt.copybuf; + + /* If there are some leftover data from previous read, use them. */ + avail = copybuf->len - copybuf->cursor; + if (avail) { + if (avail > maxread) + avail = maxread; + rc = memcpy_s(outbuf, maxread, ©buf->data[copybuf->cursor], avail); + securec_check(rc, "\0", "\0"); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + while (maxread > 0 && bytesread < minread) { + int rc; + int len; + char *buf = NULL; + + + for (;;) { + /* Try read the data. */ + if ((WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_receive(0, NULL, &buf, &len)) { + CHECK_FOR_INTERRUPTS(); + + if (len == 0) + break; + else if (len < 0) + return bytesread; + else { + /* Process the data */ + copybuf->data = buf; + copybuf->len = len; + copybuf->cursor = 0; + + avail = copybuf->len - copybuf->cursor; + if (avail > maxread) + avail = maxread; + rc = memcpy_s(outbuf, maxread, ©buf->data[copybuf->cursor], avail); + securec_check(rc, "\0", "\0"); + outbuf = (void *)((char *)outbuf + avail); + copybuf->cursor += avail; + maxread -= avail; + bytesread += avail; + } + + if (maxread <= 0 || bytesread >= minread) + return bytesread; + } else { + if (len == 0) + break; + else if (len < 0) + return bytesread; + } + } + + /* + * Wait for more data or latch. 
+ */ + pgstat_report_waitevent(WAIT_EVENT_LOGICAL_SYNC_DATA); + rc = WaitLatchOrSocket(&t_thrd.proc->procLatch, WL_SOCKET_READABLE | WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + u_sess->proc_cxt.MyProcPort->sock, 1000L); + pgstat_report_waitevent(WAIT_EVENT_END); + + /* Emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); + + ResetLatch(&t_thrd.proc->procLatch); + } + + return bytesread; +} + +/* + * Get information about remote relation in similar fashion the RELATION + * message provides during replication. + */ +static void fetch_remote_table_info(char *nspname, char *relname, LogicalRepRelation *lrel) +{ + WalRcvExecResult *res; + StringInfoData cmd; + TupleTableSlot *slot; + Oid tableRow[2] = {OIDOID, CHAROID}; + Oid attrRow[4] = {TEXTOID, OIDOID, INT4OID, BOOLOID}; + bool isnull; + int natt; + + lrel->nspname = nspname; + lrel->relname = relname; + + /* First fetch Oid and replica identity. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT c.oid, c.relreplident" + " FROM pg_catalog.pg_class c," + " pg_catalog.pg_namespace n" + " WHERE n.nspname = %s" + " AND c.relname = %s" + " AND c.relkind = 'r'", + quote_literal_cstr(nspname), quote_literal_cstr(relname)); + res = (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_exec(cmd.data, 2, tableRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, + (errmsg("could not fetch table info for table \"%s.%s\" from publisher: %s", nspname, relname, res->err))); + + slot = MakeSingleTupleTableSlot(res->tupledesc); + if (!tuplestore_gettupleslot(res->tuplestore, true, false, slot)) + ereport(ERROR, (errmsg("table \"%s.%s\" not found on publisher", nspname, relname))); + + lrel->remoteid = DatumGetObjectId(tableam_tslot_getattr(slot, 1, &isnull)); + Assert(!isnull); + lrel->replident = DatumGetChar(tableam_tslot_getattr(slot, 2, &isnull)); + Assert(!isnull); + + ExecDropSingleTupleTableSlot(slot); + walrcv_clear_result(res); + + /* Now fetch columns. */ + resetStringInfo(&cmd); + appendStringInfo(&cmd, + "SELECT a.attname," + " a.atttypid," + " a.atttypmod," + " a.attnum = ANY(i.indkey)" + " FROM pg_catalog.pg_attribute a" + " LEFT JOIN pg_catalog.pg_index i" + " ON (i.indexrelid = pg_get_replica_identity_index(%u))" + " WHERE a.attnum > 0::pg_catalog.int2" + " AND NOT a.attisdropped" + " AND a.attrelid = %u" + " ORDER BY a.attnum", + lrel->remoteid, lrel->remoteid); + res = (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_exec(cmd.data, 4, attrRow); + + if (res->status != WALRCV_OK_TUPLES) + ereport(ERROR, (errmsg("could not fetch table info for table \"%s.%s\": %s", nspname, relname, res->err))); + + /* We don't know number of rows coming, so allocate enough space. */ + lrel->attnames = (char**)palloc0(MaxTupleAttributeNumber * sizeof(char *)); + lrel->atttyps = (Oid*)palloc0(MaxTupleAttributeNumber * sizeof(Oid)); + lrel->attkeys = NULL; + + natt = 0; + slot = MakeSingleTupleTableSlot(res->tupledesc); + while (tuplestore_gettupleslot(res->tuplestore, true, false, slot)) { + lrel->attnames[natt] = pstrdup(TextDatumGetCString(tableam_tslot_getattr(slot, 1, &isnull))); + Assert(!isnull); + lrel->atttyps[natt] = DatumGetObjectId(tableam_tslot_getattr(slot, 2, &isnull)); + Assert(!isnull); + if (DatumGetBool(tableam_tslot_getattr(slot, 4, &isnull))) + lrel->attkeys = bms_add_member(lrel->attkeys, natt); + + /* Should never happen. 
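copy_read_data() above is the data-source callback handed to BeginCopyFrom(): it must first drain any bytes left over from the previous network message, then keep pulling messages until at least minread bytes have been produced, and it must never write more than maxread. A simplified standalone sketch of that contract, with the walreceiver replaced by an in-memory message queue purely for illustration, looks like this.

    // Simplified sketch of the copy_read_data() contract (illustration only):
    // serve leftover bytes first, then pull whole messages until 'minread' is satisfied.
    #include <cstring>
    #include <deque>
    #include <string>

    struct CopySource {
        std::deque<std::string> messages;  // stands in for walrcv_receive() results
        std::string leftover;              // stands in for the copybuf StringInfo
        size_t cursor = 0;                 // how much of 'leftover' is already consumed
    };

    static int CopyReadData(CopySource &src, char *outbuf, int minread, int maxread)
    {
        int bytesread = 0;
        while (maxread > 0 && bytesread < minread) {
            if (src.cursor >= src.leftover.size()) {
                if (src.messages.empty())
                    break;                               // stream exhausted
                src.leftover = src.messages.front();     // next CopyData message
                src.messages.pop_front();
                src.cursor = 0;
            }
            int avail = (int)(src.leftover.size() - src.cursor);
            if (avail > maxread)
                avail = maxread;                         // never overrun the caller's buffer
            std::memcpy(outbuf, src.leftover.data() + src.cursor, (size_t)avail);
            outbuf += avail;
            src.cursor += (size_t)avail;
            maxread -= avail;
            bytesread += avail;
        }
        return bytesread;
    }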
*/ + if (++natt >= MaxTupleAttributeNumber) + elog(ERROR, "too many columns in remote table \"%s.%s\"", nspname, relname); + + ExecClearTuple(slot); + } + ExecDropSingleTupleTableSlot(slot); + + lrel->natts = natt; + + walrcv_clear_result(res); + pfree(cmd.data); +} + +/* + * Copy existing data of a table from publisher. + * + * Caller is responsible for locking the local relation. + */ +static void copy_table(Relation rel) +{ + LogicalRepRelMapEntry *relmapentry; + LogicalRepRelation lrel; + WalRcvExecResult *res; + StringInfoData cmd; + CopyState cstate; + List *attnamelist; + AdaptMem mem_info; + mem_info.max_mem = 0; + mem_info.work_mem = 0; + + /* Get the publisher relation info. */ + fetch_remote_table_info(get_namespace_name(RelationGetNamespace(rel)), RelationGetRelationName(rel), &lrel); + + /* Put the relation into relmap. */ + logicalrep_relmap_update(&lrel); + + /* Map the publisher relation to local one. */ + relmapentry = logicalrep_rel_open(lrel.remoteid, NoLock); + Assert(rel == relmapentry->localrel); + + /* Start copy on the publisher. */ + initStringInfo(&cmd); + appendStringInfo(&cmd, "COPY %s TO STDOUT", quote_qualified_identifier(lrel.nspname, lrel.relname)); + res = (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_exec(cmd.data, 0, NULL); + pfree(cmd.data); + if (res->status != WALRCV_OK_COPY_OUT) + ereport(ERROR, (errmsg("could not start initial contents copy for table \"%s.%s\": %s", lrel.nspname, + lrel.relname, res->err))); + walrcv_clear_result(res); + + t_thrd.applyworker_cxt.copybuf = makeStringInfo(); + + /* Create CopyState for ingestion of the data from publisher. */ + attnamelist = make_copy_attnamelist(relmapentry); + cstate = BeginCopyFrom(rel, NULL, attnamelist, NIL, &mem_info, (const char*)cmd.data, copy_read_data); + + /* Do the copy */ + (void)CopyFrom(cstate); + + logicalrep_rel_close(relmapentry, NoLock); +} + +/* + * Start syncing the table in the sync worker. + * + * If nothing needs to be done to sync the table, we exit the worker without + * any further action. + */ +char *LogicalRepSyncTableStart(XLogRecPtr *origin_startpos) +{ + char *slotname; + char relstate; + XLogRecPtr relstate_lsn; + LibpqrcvConnectParam options; + CommitSeqNo csn = InvalidCommitSeqNo; + int rc; + Relation rel; + WalRcvExecResult *res; + char originname[NAMEDATALEN]; + RepOriginId originid; + + /* Check the state of the table synchronization. */ + StartTransactionCommand(); + relstate = GetSubscriptionRelState(t_thrd.applyworker_cxt.curWorker->subid, + t_thrd.applyworker_cxt.curWorker->relid, + &relstate_lsn, &csn); + CommitTransactionCommand(); + + SpinLockAcquire(&t_thrd.applyworker_cxt.curWorker->relmutex); + t_thrd.applyworker_cxt.curWorker->relstate = relstate; + t_thrd.applyworker_cxt.curWorker->relstate_lsn = relstate_lsn; + t_thrd.applyworker_cxt.curWorker->relcsn = csn; + SpinLockRelease(&t_thrd.applyworker_cxt.curWorker->relmutex); + + /* + * If synchronization is already done or no longer necessary, exit now + * that we've updated shared memory state. + */ + switch (relstate) { + case SUBREL_STATE_SYNCDONE: + case SUBREL_STATE_READY: + case SUBREL_STATE_UNKNOWN: + finish_sync_worker(); /* doesn't return */ + } + + /* Calculate the name of the tablesync slot. 
*/ + slotname = (char *)palloc(NAMEDATALEN); + ReplicationSlotNameForTablesync(t_thrd.applyworker_cxt.mySubscription->oid, + t_thrd.applyworker_cxt.curWorker->relid, slotname, NAMEDATALEN); + + if (!AttemptConnectPublisher(t_thrd.applyworker_cxt.mySubscription->conninfo, slotname, true)) { + ereport(ERROR, (errmsg("could not connect to the publisher: %s", + PQerrorMessage(t_thrd.libwalreceiver_cxt.streamConn)))); + } + + Assert(t_thrd.applyworker_cxt.curWorker->relstate == SUBREL_STATE_INIT || + t_thrd.applyworker_cxt.curWorker->relstate == SUBREL_STATE_DATASYNC || + t_thrd.applyworker_cxt.curWorker->relstate == SUBREL_STATE_FINISHEDCOPY); + + /* Assign the origin tracking record name. */ + ReplicationOriginNameForTablesync(t_thrd.applyworker_cxt.mySubscription->oid, + t_thrd.applyworker_cxt.curWorker->relid, originname, sizeof(originname)); + + if (t_thrd.applyworker_cxt.curWorker->relstate == SUBREL_STATE_DATASYNC) { + /* + * We have previously errored out before finishing the copy so the + * replication slot might exist. We want to remove the slot if it + * already exists and proceed. + * + * XXX We could also instead try to drop the slot, last time we failed + * but for that, we might need to clean up the copy state as it might + * be in the middle of fetching the rows. Also, if there is a network + * breakdown then it wouldn't have succeeded so trying it next time + * seems like a better bet. + */ + ReplicationSlotDropAtPubNode(slotname, true); + } else if (t_thrd.applyworker_cxt.curWorker->relstate == SUBREL_STATE_FINISHEDCOPY) { + /* + * The COPY phase was previously done, but tablesync then crashed + * before it was able to finish normally. + */ + StartTransactionCommand(); + + /* + * The origin tracking name must already exist. It was created first + * time this tablesync was launched. + */ + originid = replorigin_by_name(originname, false); + replorigin_session_setup(originid); + u_sess->reporigin_cxt.originId = originid; + *origin_startpos = replorigin_session_get_progress(false); + + CommitTransactionCommand(); + + goto copy_table_done; + } + + SpinLockAcquire(&t_thrd.applyworker_cxt.curWorker->relmutex); + t_thrd.applyworker_cxt.curWorker->relstate = SUBREL_STATE_DATASYNC; + t_thrd.applyworker_cxt.curWorker->relstate_lsn = InvalidXLogRecPtr; + SpinLockRelease(&t_thrd.applyworker_cxt.curWorker->relmutex); + + /* Update the state and make it visible to others. */ + StartTransactionCommand(); + UpdateSubscriptionRelState(t_thrd.applyworker_cxt.curWorker->subid, + t_thrd.applyworker_cxt.curWorker->relid, + t_thrd.applyworker_cxt.curWorker->relstate, + t_thrd.applyworker_cxt.curWorker->relstate_lsn); + CommitTransactionCommand(); + pgstat_report_stat(false); + + StartTransactionCommand(); + + /* + * Use standard write lock here. It might be better to + * disallow access to table while it's being synchronized. + * But we don't want to block the main apply process from + * working and it has to open relation in RowExclusiveLock + * when remapping remote relation id to local one. + */ + rel = heap_open(t_thrd.applyworker_cxt.curWorker->relid, RowExclusiveLock); + + /* + * Start a transaction in the remote node in REPEATABLE READ mode. This + * ensures that both the replication slot we create (see below) and the + * COPY are consistent with each other. 
+ */ + res = (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_exec("BEGIN READ ONLY ISOLATION LEVEL " + "REPEATABLE READ", + 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, (errmsg("table copy could not start transaction on publisher"), + errdetail("The error was: %s", res->err))); + walrcv_clear_result(res); + + /* + * Create a new permanent logical decoding slot. This slot will be used + * for the catchup phase after COPY is done, so tell it to use the + * snapshot to make the final data consistent. + */ + rc = memset_s(&options, sizeof(LibpqrcvConnectParam), 0, sizeof(LibpqrcvConnectParam)); + securec_check(rc, "", ""); + options.logical = true; + options.slotname = slotname; + options.useSnapshot = true; + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_create_slot(&options, origin_startpos, &csn); + + /* + * Setup replication origin tracking. The purpose of doing this before the + * copy is to avoid doing the copy again due to any error in setting up + * origin tracking. + */ + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) { + /* + * Origin tracking does not exist, so create it now. + * + * Then advance to the LSN got from walrcv_create_slot. This is WAL + * logged for the purpose of recovery. Locks are to prevent the + * replication origin from vanishing while advancing. + */ + originid = replorigin_create(originname); + + LockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + replorigin_advance(originid, *origin_startpos, InvalidXLogRecPtr, true /* go backward */, true /* WAL log */); + UnlockRelationOid(ReplicationOriginRelationId, RowExclusiveLock); + + replorigin_session_setup(originid); + u_sess->reporigin_cxt.originId = originid; + } else { + ereport(ERROR, + (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("replication origin \"%s\" already exists", originname))); + } + + /* Now do the initial data copy */ + PushActiveSnapshot(GetTransactionSnapshot()); + copy_table(rel); + PopActiveSnapshot(); + + res = (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_exec("COMMIT", 0, NULL); + if (res->status != WALRCV_OK_COMMAND) + ereport(ERROR, (errmsg("table copy could not finish transaction on publisher"), + errdetail("The error was: %s", res->err))); + walrcv_clear_result(res); + + heap_close(rel, NoLock); + + /* Make the copy visible. */ + CommandCounterIncrement(); + + /* + * Update the persisted state to indicate the COPY phase is done; make it + * visible to others. + */ + UpdateSubscriptionRelState(t_thrd.applyworker_cxt.curWorker->subid, + t_thrd.applyworker_cxt.curWorker->relid, + SUBREL_STATE_FINISHEDCOPY, + t_thrd.applyworker_cxt.curWorker->relstate_lsn, + csn); + + CommitTransactionCommand(); + +copy_table_done: + + ereport(DEBUG1, (errmsg("LogicalRepSyncTableStart: '%s' origin_startpos lsn %X/%X", originname, + (uint32)(*origin_startpos >> 32), (uint32)*origin_startpos))); + + /* + * We are done with the initial data synchronization, + * update the state. + */ + SpinLockAcquire(&t_thrd.applyworker_cxt.curWorker->relmutex); + t_thrd.applyworker_cxt.curWorker->relstate = SUBREL_STATE_SYNCWAIT; + t_thrd.applyworker_cxt.curWorker->relstate_lsn = *origin_startpos; + t_thrd.applyworker_cxt.curWorker->relcsn = csn; + SpinLockRelease(&t_thrd.applyworker_cxt.curWorker->relmutex); + + /* + * Finally, wait until the main apply worker tells us to catch up and then + * return to let LogicalRepApplyLoop do it. 
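Seen from the publisher, the setup above boils down to a short command sequence on the walsender connection before any rows flow. The list below is a sketch assembled from the replication grammar and the calls in this file; the slot and table names are examples, and the exact text emitted by walrcv_create_slot() is an assumption based on the new USE_SNAPSHOT option.

    // Approximate command sequence the tablesync worker drives on the publisher
    // (illustration only; slot and table names are examples).
    static const char *const tablesyncSetupCmds[] = {
        "BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ",
        // walrcv_create_slot() with useSnapshot set is assumed to send:
        "CREATE_REPLICATION_SLOT pg_16385_sync_16390_1234567890123456789 LOGICAL pgoutput USE_SNAPSHOT",
        // the COPY then runs under exactly the snapshot the slot was created with
        "COPY public.tab TO STDOUT",
        "COMMIT",
    };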
+ */ + wait_for_worker_state_change(SUBREL_STATE_CATCHUP); + + return slotname; +} diff --git a/src/gausskernel/storage/replication/logical/worker.cpp b/src/gausskernel/storage/replication/logical/worker.cpp index 4bbfc62b6c..05272c9413 100644 --- a/src/gausskernel/storage/replication/logical/worker.cpp +++ b/src/gausskernel/storage/replication/logical/worker.cpp @@ -39,6 +39,7 @@ #include "catalog/namespace.h" #include "catalog/pg_subscription.h" #include "catalog/pg_partition_fn.h" +#include "catalog/pg_subscription_rel.h" #include "commands/trigger.h" #include "commands/subscriptioncmds.h" @@ -115,6 +116,30 @@ static void apply_dispatch(StringInfo s); static void apply_handle_conninfo(StringInfo s); static void UpdateConninfo(char* standbysInfo); +/* + * Should this worker apply changes for given relation. + * + * This is mainly needed for initial relation data sync as that runs in + * separate worker process running in parallel and we need some way to skip + * changes coming to the main apply worker during the sync of a table. + * + * Note we need to do smaller or equals comparison for SYNCDONE state because + * it might hold position of end of intitial slot consistent point WAL + * record + 1 (ie start of next record) and next record can be COMMIT of + * transaction we are now processing (which is what we set remote_final_lsn + * to in apply_handle_begin). + */ +static bool should_apply_changes_for_rel(LogicalRepRelMapEntry *rel) +{ + if (AM_TABLESYNC_WORKER) + return (t_thrd.applyworker_cxt.curWorker->relid == rel->localreloid && + (COMMITSEQNO_IS_FROZEN(t_thrd.applyworker_cxt.curWorker->relcsn) || + t_thrd.applyworker_cxt.curRemoteCsn > t_thrd.applyworker_cxt.curWorker->relcsn)); + else + return (rel->state == SUBREL_STATE_READY || + (rel->state == SUBREL_STATE_SYNCDONE && rel->statelsn <= t_thrd.applyworker_cxt.remoteFinalLsn)); +} + /* SIGHUP: set flag to re-read config file at next convenient time */ static void LogicalrepWorkerSighub(SIGNAL_ARGS) { @@ -450,6 +475,8 @@ static void apply_handle_begin(StringInfo s) logicalrep_read_begin(s, &begin_data); t_thrd.applyworker_cxt.inRemoteTransaction = true; + t_thrd.applyworker_cxt.remoteFinalLsn = begin_data.final_lsn; + t_thrd.applyworker_cxt.curRemoteCsn = begin_data.csn; pgstat_report_activity(STATE_RUNNING, NULL); } @@ -463,6 +490,8 @@ static void apply_handle_commit(StringInfo s) logicalrep_read_commit(s, &commit_data); + Assert(commit_data.commit_lsn == t_thrd.applyworker_cxt.remoteFinalLsn); + if (IsTransactionState()) { /* * Update origin state so we can restart streaming from correct @@ -478,6 +507,9 @@ static void apply_handle_commit(StringInfo s) t_thrd.applyworker_cxt.inRemoteTransaction = false; + /* Process any tables that are being synchronized in parallel. */ + process_syncing_tables(commit_data.end_lsn); + pgstat_report_activity(STATE_IDLE, NULL); } @@ -490,7 +522,7 @@ static void apply_handle_origin(StringInfo s) * ORIGIN message can only come inside remote transaction and before * any actual writes. 
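The "smaller or equals" comparison explained above matters exactly at the boundary: the SYNCDONE LSN can point at the start of the very COMMIT record the apply worker is processing, and a strict less-than would make the apply worker skip a transaction the sync worker never applied either. A toy check of that boundary, with made-up LSN values, is shown below.

    // Boundary case behind the "<=" in should_apply_changes_for_rel() (illustration only;
    // XLogRecPtr is treated as a plain 64-bit position and the values are made up).
    #include <cassert>
    #include <cstdint>

    typedef uint64_t XLogRecPtr;

    static bool ShouldApply(char state, XLogRecPtr statelsn, XLogRecPtr remoteFinalLsn)
    {
        return state == 'r' ||                                // SUBREL_STATE_READY
               (state == 's' && statelsn <= remoteFinalLsn);  // SUBREL_STATE_SYNCDONE
    }

    int main()
    {
        XLogRecPtr commitLsn = 0x1000;   // remote_final_lsn of the transaction being applied
        // statelsn may equal that commit's position; "<" instead of "<=" would drop the commit.
        assert(ShouldApply('s', 0x1000, commitLsn));
        assert(!ShouldApply('s', 0x1008, commitLsn));         // sync worker already applied it
        return 0;
    }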
*/ - if (!t_thrd.applyworker_cxt.inRemoteTransaction || IsTransactionState()) + if (!t_thrd.applyworker_cxt.inRemoteTransaction || (IsTransactionState() && !AM_TABLESYNC_WORKER)) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("ORIGIN message sent out of order"))); } @@ -555,6 +587,14 @@ static void apply_handle_insert(StringInfo s) relid = logicalrep_read_insert(s, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Initialize the executor state. */ estate = create_estate_for_relation(rel); @@ -670,6 +710,14 @@ static void apply_handle_update(StringInfo s) relid = logicalrep_read_update(s, &has_oldtup, &oldtup, &newtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Check if we can do the update. */ check_relation_updatable(rel); @@ -780,6 +828,14 @@ static void apply_handle_delete(StringInfo s) relid = logicalrep_read_delete(s, &oldtup); rel = logicalrep_rel_open(relid, RowExclusiveLock); + if (!should_apply_changes_for_rel(rel)) { + /* + * The relation can't become interesting in the middle of the + * transaction so it's safe to unlock it. + */ + logicalrep_rel_close(rel, RowExclusiveLock); + return; + } /* Check if we can do the delete. */ check_relation_updatable(rel); @@ -982,7 +1038,11 @@ static void ApplyWorkerProcessMsg(char type, StringInfo s, XLogRecPtr *lastRcv) PrimaryKeepaliveMessage keepalive; pq_copymsgbytes(s, (char*)&keepalive, sizeof(PrimaryKeepaliveMessage)); - send_feedback(keepalive.walEnd, keepalive.replyRequested, false); + if (*lastRcv < keepalive.walEnd) { + *lastRcv = keepalive.walEnd; + } + + send_feedback(*lastRcv, keepalive.replyRequested, false); UpdateWorkerStats(*lastRcv, keepalive.sendTime, true); } } @@ -1040,9 +1100,8 @@ static inline void ProcessApplyWorkerInterrupts(void) /* * Apply main loop. */ -static void ApplyLoop(void) +static void LogicalRepApplyLoop(XLogRecPtr last_received) { - XLogRecPtr last_received = InvalidXLogRecPtr; bool ping_sent = false; TimestampTz last_recv_timestamp = GetCurrentTimestamp(); @@ -1109,11 +1168,13 @@ static void ApplyLoop(void) * If we didn't get any transactions for a while there might be * unconsumed invalidation messages in the queue, consume them now. */ - StartTransactionCommand(); + AcceptInvalidationMessages(); /* Check for subscription change */ if (!t_thrd.applyworker_cxt.mySubscriptionValid) reread_subscription(); - CommitTransactionCommand(); + + /* Process any table synchronization changes. */ + process_syncing_tables(last_received); } if (t_thrd.applyworker_cxt.got_SIGHUP) { @@ -1210,6 +1271,13 @@ static void reread_subscription(void) { MemoryContext oldctx; Subscription *newsub; + bool started_tx = false; + + /* This function might be called inside or outside of transaction. */ + if (!IsTransactionState()) { + StartTransactionCommand(); + started_tx = true; + } /* Ensure allocations in permanent context. 
*/ oldctx = MemoryContextSwitchTo(t_thrd.applyworker_cxt.applyContext); @@ -1307,6 +1375,9 @@ static void reread_subscription(void) SetConfigOption("synchronous_commit", t_thrd.applyworker_cxt.mySubscription->synccommit, PGC_BACKEND, PGC_S_OVERRIDE); + if (started_tx) + CommitTransactionCommand(); + t_thrd.applyworker_cxt.mySubscriptionValid = true; } @@ -1324,8 +1395,8 @@ void ApplyWorkerMain() { MemoryContext oldctx; char originname[NAMEDATALEN]; - RepOriginId originid; XLogRecPtr origin_startpos; + char *myslotname; int rc = 0; LibpqrcvConnectParam options; @@ -1472,48 +1543,75 @@ void ApplyWorkerMain() /* Keep us informed about subscription changes. */ CacheRegisterThreadSyscacheCallback(SUBSCRIPTIONOID, subscription_change_cb, (Datum)0); - ereport(LOG, (errmsg("logical replication apply for worker subscription \"%s\" has started", - t_thrd.applyworker_cxt.mySubscription->name))); - - /* Setup replication origin tracking. */ - rc = sprintf_s(originname, sizeof(originname), "pg_%u", t_thrd.applyworker_cxt.mySubscription->oid); - securec_check_ss(rc, "", ""); - originid = replorigin_by_name(originname, true); - if (!OidIsValid(originid)) - originid = replorigin_create(originname); - replorigin_session_setup(originid); - u_sess->reporigin_cxt.originId = originid; - origin_startpos = replorigin_session_get_progress(false); + if (AM_TABLESYNC_WORKER) + ereport(LOG, (errmsg("logical replication table synchronization for subscription %s, table %s has started", + t_thrd.applyworker_cxt.mySubscription->name, get_rel_name(t_thrd.applyworker_cxt.curWorker->relid)))); + else + ereport(LOG, (errmsg("logical replication apply worker for subscription \"%s\" has started", + t_thrd.applyworker_cxt.mySubscription->name))); CommitTransactionCommand(); - if (!AttemptConnectPublisher(t_thrd.applyworker_cxt.mySubscription->conninfo, - t_thrd.applyworker_cxt.mySubscription->name, true)) { - ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("Failed to connect to publisher."))); + if (AM_TABLESYNC_WORKER) { + char *syncslotname; + + /* This is table synchroniation worker, call initial sync. */ + syncslotname = LogicalRepSyncTableStart(&origin_startpos); + + /* allocate slot name in long-lived context */ + myslotname = MemoryContextStrdup(t_thrd.applyworker_cxt.applyContext, syncslotname); + + pfree(syncslotname); + } else { + RepOriginId originid; + + myslotname = t_thrd.applyworker_cxt.mySubscription->slotname; + + /* Setup replication origin tracking. */ + StartTransactionCommand(); + rc = sprintf_s(originname, sizeof(originname), "pg_%u", t_thrd.applyworker_cxt.mySubscription->oid); + securec_check_ss(rc, "", ""); + originid = replorigin_by_name(originname, true); + if (!OidIsValid(originid)) + originid = replorigin_create(originname); + replorigin_session_setup(originid); + u_sess->reporigin_cxt.originId = originid; + origin_startpos = replorigin_session_get_progress(false); + CommitTransactionCommand(); + + if (!AttemptConnectPublisher(t_thrd.applyworker_cxt.mySubscription->conninfo, myslotname, true)) { + ereport(ERROR, (errcode(ERRCODE_CONNECTION_FAILURE), errmsg("Failed to connect to publisher."))); + } + + /* + * We don't really use the output identify_system for anything + * but it does some initializations on the upstream so let's still + * call it. + */ + (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_identify_system(); } /* - * We don't really use the output identify_system for anything - * but it does some initializations on the upstream so let's still - * call it. 
+ * Setup callback for syscache so that we know when something + * changes in the subscription relation state. */ - (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_identify_system(); + CacheRegisterThreadSyscacheCallback(SUBSCRIPTIONRELMAP, invalidate_syncing_table_states, (Datum)0); /* Build logical replication streaming options. */ rc = memset_s(&options, sizeof(LibpqrcvConnectParam), 0, sizeof(LibpqrcvConnectParam)); securec_check(rc, "", ""); options.logical = true; options.startpoint = origin_startpos; - options.slotname = t_thrd.applyworker_cxt.mySubscription->slotname; + options.slotname = myslotname; options.protoVersion = LOGICALREP_PROTO_VERSION_NUM; options.publicationNames = t_thrd.applyworker_cxt.mySubscription->publications; options.binary = t_thrd.applyworker_cxt.mySubscription->binary; - /* Start streaming from the slot. */ + /* Start normal logical streaming replication. */ (WalReceiverFuncTable[GET_FUNC_IDX]).walrcv_startstreaming(&options); /* Run the main loop. */ - ApplyLoop(); + LogicalRepApplyLoop(origin_startpos); ereport(LOG, (errmsg("ApplyWorker: shutting down"))); proc_exit(0); diff --git a/src/gausskernel/storage/replication/repl_gram.y b/src/gausskernel/storage/replication/repl_gram.y index 1c65c3cfff..4c6e1101e6 100755 --- a/src/gausskernel/storage/replication/repl_gram.y +++ b/src/gausskernel/storage/replication/repl_gram.y @@ -73,6 +73,7 @@ %token SCONST IDENT %token RECPTR %token ICONST +%token T_WORD /* Keyword tokens. */ %token K_BASE_BACKUP @@ -102,15 +103,17 @@ %token K_PHYSICAL %token K_LOGICAL %token K_SLOT +%token K_USE_SNAPSHOT %type command -%type base_backup start_replication start_data_replication fetch_mot_checkpoint start_logical_replication advance_logical_replication identify_system identify_version identify_mode identify_consistence create_replication_slot drop_replication_slot identify_maxlsn identify_channel identify_az +%type base_backup start_replication start_data_replication fetch_mot_checkpoint start_logical_replication advance_logical_replication identify_system identify_version identify_mode identify_consistence create_replication_slot drop_replication_slot identify_maxlsn identify_channel identify_az sql_cmd %type base_backup_opt_list %type base_backup_opt %type plugin_options plugin_opt_list %type plugin_opt_elem %type plugin_opt_arg %type opt_slot +%type uses_napshot_opt %% firstcmd: command opt_semicolon @@ -139,6 +142,7 @@ command: | identify_maxlsn | identify_channel | identify_az + | sql_cmd ; /* @@ -366,16 +370,18 @@ advance_logical_replication: cmd->kind = REPLICATION_KIND_PHYSICAL; cmd->slotname = $2; cmd->init_slot_lsn = $4; + cmd->useSnapshot = false; $$ = (Node *) cmd; } /* CREATE_REPLICATION_SLOT slot LOGICAL plugin */ - | K_CREATE_REPLICATION_SLOT IDENT K_LOGICAL IDENT + | K_CREATE_REPLICATION_SLOT IDENT K_LOGICAL IDENT uses_napshot_opt { CreateReplicationSlotCmd *cmd; cmd = makeNode(CreateReplicationSlotCmd); cmd->kind = REPLICATION_KIND_LOGICAL; cmd->slotname = $2; cmd->plugin = $4; + cmd->useSnapshot = $5; $$ = (Node *) cmd; } ; @@ -428,6 +434,26 @@ plugin_opt_arg: SCONST { $$ = (Node *) makeString($1); } | /* EMPTY */ { $$ = NULL; } ; + +uses_napshot_opt: + K_USE_SNAPSHOT { $$ = true; } + | /* EMPTY */ { $$ = false; } + +sql_cmd: + IDENT + { + SQLCmd *cmd = makeNode(SQLCmd); + int tok; + + /* Just move lexer to the end of command. 
*/ + for (;;) { + tok = yylex(&yylval, &yylloc, yyscanner); + if (tok == ';' || tok == 0) + break; + } + $$ = (Node *)cmd; + } + ; %% void diff --git a/src/gausskernel/storage/replication/repl_scanner.l b/src/gausskernel/storage/replication/repl_scanner.l index cb8ed016a9..adb0b87710 100755 --- a/src/gausskernel/storage/replication/repl_scanner.l +++ b/src/gausskernel/storage/replication/repl_scanner.l @@ -118,6 +118,7 @@ DROP_REPLICATION_SLOT { return K_DROP_REPLICATION_SLOT; } PHYSICAL { return K_PHYSICAL; } LOGICAL { return K_LOGICAL; } SLOT { return K_SLOT; } +USE_SNAPSHOT { return K_USE_SNAPSHOT; } "," { return ','; } ";" { return ';'; } @@ -199,9 +200,7 @@ SLOT { return K_SLOT; } } . { - ereport(ERROR, - (errcode(ERRCODE_SYNTAX_ERROR), - errmsg("syntax error: unexpected character \"%s\"", yytext))); + return T_WORD; } %% diff --git a/src/gausskernel/storage/replication/slot.cpp b/src/gausskernel/storage/replication/slot.cpp index 4555b4afc7..0785932f7a 100755 --- a/src/gausskernel/storage/replication/slot.cpp +++ b/src/gausskernel/storage/replication/slot.cpp @@ -473,13 +473,13 @@ void ReplicationSlotAcquire(const char *name, bool isDummyStandby, bool allowDro /* If we did not find the slot or it was already active, error out. */ if (slot == NULL) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("replication slot \"%s\" does not exist", name))); - /* We allow dropping active logical replication slots on standby in opengauss. */ + /* We allow dropping active logical replication slots on standby or for subscription in opengauss. */ if (active) { - if ((slot->data.database != InvalidOid + if (((slot->data.database != InvalidOid #ifndef ENABLE_MULTIPLE_NODES && !allowDrop #endif - ) || isDummyStandby != slot->data.isDummyStandby) + ) || isDummyStandby != slot->data.isDummyStandby) && strcmp(slot->data.plugin.data, "pgoutput") != 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_IN_USE), errmsg("replication slot \"%s\" is already active", name))); else { ereport(WARNING, diff --git a/src/gausskernel/storage/replication/subscription_walreceiver.cpp b/src/gausskernel/storage/replication/subscription_walreceiver.cpp index 95cd326ef1..00ca2d4a0a 100644 --- a/src/gausskernel/storage/replication/subscription_walreceiver.cpp +++ b/src/gausskernel/storage/replication/subscription_walreceiver.cpp @@ -70,7 +70,8 @@ void sub_startstreaming(const LibpqrcvConnectParam *options) { return StartRemoteStreaming(options); } -void sub_create_slot(const LibpqrcvConnectParam *options) +void sub_create_slot(const LibpqrcvConnectParam *options, XLogRecPtr *lsn, CommitSeqNo *csn) { - return CreateRemoteReplicationSlot(options->startpoint, options->slotname, options->logical); + return CreateRemoteReplicationSlot(options->startpoint, options->slotname, options->logical, lsn, + options->useSnapshot, csn); } diff --git a/src/gausskernel/storage/replication/walreceiver.cpp b/src/gausskernel/storage/replication/walreceiver.cpp index f979180185..a83c04c15f 100755 --- a/src/gausskernel/storage/replication/walreceiver.cpp +++ b/src/gausskernel/storage/replication/walreceiver.cpp @@ -161,7 +161,7 @@ const WalReceiverFunc WalReceiverFuncTable[] = { { archive_connect, archive_receive, archive_send, archive_disconnect, NULL, NULL, NULL, NULL }, { shared_storage_connect, shared_storage_receive, shared_storage_send, shared_storage_disconnect, NULL, NULL, NULL, NULL}, - { sub_connect, libpqrcv_receive, libpqrcv_send, libpqrcv_disconnect, libpqrcv_command, sub_identify_system, + { sub_connect, libpqrcv_receive, libpqrcv_send, 
libpqrcv_disconnect, libpqrcv_exec, sub_identify_system, sub_startstreaming, sub_create_slot} }; diff --git a/src/gausskernel/storage/replication/walsender.cpp b/src/gausskernel/storage/replication/walsender.cpp index f288c6c120..ab12be020b 100755 --- a/src/gausskernel/storage/replication/walsender.cpp +++ b/src/gausskernel/storage/replication/walsender.cpp @@ -362,6 +362,17 @@ int WalSenderMain(void) */ walsnd_context = AllocSetContextCreate(t_thrd.top_mem_cxt, "Wal Sender", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + t_thrd.mem_cxt.msg_mem_cxt = AllocSetContextCreate(t_thrd.top_mem_cxt, + "MessageContext", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + t_thrd.mem_cxt.mask_password_mem_cxt = AllocSetContextCreate(t_thrd.top_mem_cxt, + "MaskPasswordCtx", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); (void)MemoryContextSwitchTo(walsnd_context); /* Set up resource owner */ @@ -1231,11 +1242,14 @@ int logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int */ static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd) { +#define MAX_ULONG_LENGTH 22 const char *slot_name = NULL; StringInfoData buf; bool isDummyStandby = false; const char *snapshot_name = NULL; + Snapshot snap; char xpos[MAXFNAMELEN]; + char strCSN[MAX_ULONG_LENGTH]; int rc = 0; Assert(!t_thrd.slot_cxt.MyReplicationSlot); @@ -1266,6 +1280,27 @@ static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd) ValidateName(cmd->slotname); ValidateName(cmd->plugin); char *fullname = NULL; + + /* + * Do options check early so that we can bail before calling the + * DecodingContextFindStartpoint which can take long time. + */ + if (cmd->useSnapshot) { + if (!IsTransactionBlock()) { + ereport(ERROR, (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must be called inside a transaction"))); + } + if (u_sess->utils_cxt.XactIsoLevel != XACT_REPEATABLE_READ) { + ereport(ERROR, (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must be called in REPEATABLE READ isolation mode transaction"))); + } + if (u_sess->utils_cxt.FirstSnapshotSet) { + ereport(ERROR, (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT must be called before any query"))); + } + if (IsSubTransaction()) + ereport(ERROR, (errmsg("CREATE_REPLICATION_SLOT ... USE_SNAPSHOT " + "must not be called in a subtransaction"))); + } fullname = expand_dynamic_library_name(cmd->plugin); /* Load the shared library, unless we already did */ @@ -1279,11 +1314,20 @@ static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd) /* build initial snapshot, might take a while */ DecodingContextFindStartpoint(ctx); - /* - * Export a plain (not of the snapbuild.c type) snapshot to the user - * that can be imported into another session. - */ - snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder); + if (!cmd->useSnapshot) { + /* + * Export a plain (not of the snapbuild.c type) snapshot to the user + * that can be imported into another session. 
+ */ + snapshot_name = SnapBuildExportSnapshot(ctx->snapshot_builder); + } else { + t_thrd.walsender_cxt.isUseSnapshot = true; + + snap = SnapBuildInitialSnapshot(ctx->snapshot_builder); + SetTransactionSnapshot(snap, NULL, InvalidPid); + rc = snprintf_s(strCSN, MAX_ULONG_LENGTH, MAX_ULONG_LENGTH - 1, "%lu", snap->snapshotcsn); + securec_check_ss(rc, "\0", "\0"); + } /* don't need the decoding context anymore */ FreeDecodingContext(ctx); @@ -1307,7 +1351,11 @@ static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd) * that.) */ pq_beginmessage(&buf, 'T'); - pq_sendint16(&buf, 4); /* 4 field */ + if (cmd->useSnapshot) { + pq_sendint16(&buf, 5); /* 5 field */ + } else { + pq_sendint16(&buf, 4); /* 4 field */ + } /* first field: slot name */ pq_sendstring(&buf, "slot_name"); /* col name */ @@ -1344,11 +1392,26 @@ static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd) pq_sendint16(&buf, UINT16_MAX); /* typlen */ pq_sendint32(&buf, 0); /* typmod */ pq_sendint16(&buf, 0); /* format code */ + + if (cmd->useSnapshot) { + /* fifth field: use snapshot's csn */ + pq_sendstring(&buf, "snapshot_csn"); /* col name */ + pq_sendint32(&buf, 0); /* table oid */ + pq_sendint16(&buf, 0); /* attnum */ + pq_sendint32(&buf, TEXTOID); /* type oid */ + pq_sendint16(&buf, UINT16_MAX); /* typlen */ + pq_sendint32(&buf, 0); /* typmod */ + pq_sendint16(&buf, 0); /* format code */ + } pq_endmessage_noblock(&buf); /* Send a DataRow message */ pq_beginmessage(&buf, 'D'); - pq_sendint16(&buf, 4); /* # of columns */ + if (cmd->useSnapshot) { + pq_sendint16(&buf, 5); /* # of columns */ + } else { + pq_sendint16(&buf, 4); /* # of columns */ + } /* slot_name */ pq_sendint32(&buf, strlen(slot_name)); /* col1 len */ @@ -1372,6 +1435,12 @@ static void CreateReplicationSlot(CreateReplicationSlotCmd *cmd) pq_sendbytes(&buf, cmd->plugin, strlen(cmd->plugin)); } else pq_sendint32(&buf, UINT32_MAX); /* col4 len, NULL */ + + /* snapshot csn */ + if (cmd->useSnapshot) { + pq_sendint32(&buf, strlen(strCSN)); /* col5 len */ + pq_sendbytes(&buf, strCSN, strlen(strCSN)); + } pq_endmessage_noblock(&buf); /* Send CommandComplete and ReadyForQuery messages */ @@ -2140,6 +2209,14 @@ static void IdentifyCommand(Node* cmd_node, ReplicationCxt* repCxt, const char * break; #endif + case T_SQLCmd: + if (u_sess->proc_cxt.MyDatabaseId == InvalidOid) + ereport(ERROR, (errmsg("not connected to database"))); + execute_simple_query(cmd_string); + /* Send CommandComplete and ReadyForQuery messages */ + ReadyForQuery((CommandDest)t_thrd.postgres_cxt.whereToSendOutput); + break; + default: ereport(FATAL, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("invalid standby query string: %s", cmd_string))); @@ -2191,6 +2268,23 @@ static void HandleWalReplicationCommand(const char *cmd_string, ReplicationCxt* cmd_node = t_thrd.replgram_cxt.replication_parse_result; + /* + * CREATE_REPLICATION_SLOT ... LOGICAL exports a snapshot. If it was + * called outside of transaction the snapshot should be cleared here. + */ + if (!IsTransactionBlock()) + SnapBuildClearExportedSnapshot(); + + /* + * For aborted transactions, don't allow anything except pure SQL, + * the exec_simple_query() will handle it correctly. 
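With the SQLCmd path above, a walsender attached to a database now accepts ordinary SQL, which is what lets fetch_remote_table_info() in tablesync.cpp read the publisher's catalogs over the replication connection. The two queries it builds are collected below for reference; the %s and %u placeholders stand for the quoted schema and relation names and the remote relation OID.

    // Catalog queries issued by fetch_remote_table_info() over the replication connection
    // (copied from tablesync.cpp above; placeholders are filled in by appendStringInfo()).
    static const char *const remoteTableInfoQueries[] = {
        // 1. resolve the remote OID and replica identity of the published table
        "SELECT c.oid, c.relreplident"
        "  FROM pg_catalog.pg_class c, pg_catalog.pg_namespace n"
        " WHERE n.nspname = %s AND c.relname = %s AND c.relkind = 'r'",

        // 2. fetch its columns, their types and replica-identity key membership
        "SELECT a.attname, a.atttypid, a.atttypmod, a.attnum = ANY(i.indkey)"
        "  FROM pg_catalog.pg_attribute a"
        "  LEFT JOIN pg_catalog.pg_index i ON (i.indexrelid = pg_get_replica_identity_index(%u))"
        " WHERE a.attnum > 0::pg_catalog.int2 AND NOT a.attisdropped AND a.attrelid = %u"
        " ORDER BY a.attnum",
    };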
+ */ + if (IsAbortedTransactionBlockState() && !IsA(cmd_node, SQLCmd)) + ereport(ERROR, (errcode(ERRCODE_IN_FAILED_SQL_TRANSACTION), errmsg("current transaction is aborted, " + "commands ignored until end of transaction block"))); + + CHECK_FOR_INTERRUPTS(); + IdentifyCommand(cmd_node, repCxt, cmd_string); /* done */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 18c86015ea..c3366cd63b 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -62,6 +62,6 @@ #define NAILED_IN_CATALOG_NUM 8 -#define CATALOG_NUM 105 +#define CATALOG_NUM 106 #endif diff --git a/src/include/catalog/indexing.h b/src/include/catalog/indexing.h index 6a8236549b..6c1159ee4c 100644 --- a/src/include/catalog/indexing.h +++ b/src/include/catalog/indexing.h @@ -651,6 +651,8 @@ DECLARE_UNIQUE_INDEX(pg_replication_origin_roident_index, 6136, on pg_replicatio DECLARE_UNIQUE_INDEX(pg_replication_origin_roname_index, 6137, on pg_replication_origin using btree(roname text_pattern_ops)); #define ReplicationOriginNameIndex 6137 +DECLARE_UNIQUE_INDEX(pg_subscription_rel_srrelid_srsubid_index, 6138, on pg_subscription_rel using btree(srrelid oid_ops, srsubid oid_ops)); +#define SubscriptionRelSrrelidSrsubidIndexId 6138 /* last step of initialization script: build the indexes declared above */ BUILD_INDICES diff --git a/src/include/catalog/pg_subscription_rel.h b/src/include/catalog/pg_subscription_rel.h new file mode 100644 index 0000000000..9d4b3237ae --- /dev/null +++ b/src/include/catalog/pg_subscription_rel.h @@ -0,0 +1,81 @@ +/* ------------------------------------------------------------------------- + * + * pg_subscription_rel.h + * Local info about tables that come from the publisher of a + * subscription (pg_subscription_rel). + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * ------------------------------------------------------------------------- + */ +#ifndef PG_SUBSCRIPTION_REL_H +#define PG_SUBSCRIPTION_REL_H + +#include "catalog/genbki.h" + +/* ---------------- + * pg_subscription_rel definition. 
cpp turns this into + * typedef struct FormData_pg_subscription_rel + * ---------------- + */ +#define SubscriptionRelRelationId 6135 +#define SubscriptionRelRelation_Rowtype_Id 6139 + +CATALOG(pg_subscription_rel,6135) BKI_WITHOUT_OIDS BKI_ROWTYPE_OID(6139) BKI_SCHEMA_MACRO +{ + Oid srsubid; /* Oid of subscription */ + Oid srrelid; /* Oid of relation */ + char srsubstate; /* state of the relation in subscription */ + int8 srcsn; /* csn of snapshot used during copy */ +#ifdef CATALOG_VARLEN /* variable-length fields start here */ + text srsublsn; /* remote lsn of the state change + * used for synchronization coordination */ +#endif +} FormData_pg_subscription_rel; + +typedef FormData_pg_subscription_rel *Form_pg_subscription_rel; + +/* ---------------- + * compiler constants for pg_subscription_rel + * ---------------- + */ +#define Natts_pg_subscription_rel 5 +#define Anum_pg_subscription_rel_srsubid 1 +#define Anum_pg_subscription_rel_srrelid 2 +#define Anum_pg_subscription_rel_srsubstate 3 +#define Anum_pg_subscription_rel_srcsn 4 +#define Anum_pg_subscription_rel_srsublsn 5 + +/* ---------------- + * substate constants + * ---------------- + */ +#define SUBREL_STATE_INIT 'i' /* initializing (sublsn NULL) */ +#define SUBREL_STATE_DATASYNC 'd' /* data is being synchronized (sublsn NULL) */ +#define SUBREL_STATE_FINISHEDCOPY 'f' /* tablesync copy phase is completed + * (sublsn NULL) */ +#define SUBREL_STATE_SYNCDONE 's' /* synchronization finished infront of apply (sublsn set) */ +#define SUBREL_STATE_READY 'r' /* ready (sublsn set) */ + +/* These are never stored in the catalog, we only use them for IPC. */ +#define SUBREL_STATE_UNKNOWN '\0' /* unknown state */ +#define SUBREL_STATE_SYNCWAIT 'w' /* waiting for sync */ +#define SUBREL_STATE_CATCHUP 'c' /* catching up with apply */ + +typedef struct SubscriptionRelState +{ + Oid relid; + XLogRecPtr lsn; + char state; +} SubscriptionRelState; + +extern Oid AddSubscriptionRelState(Oid subid, Oid relid, char state); +extern Oid UpdateSubscriptionRelState(Oid subid, Oid relid, char state, XLogRecPtr sublsn, + CommitSeqNo subcsn = InvalidCommitSeqNo); +extern char GetSubscriptionRelState(Oid subid, Oid relid, XLogRecPtr *sublsn, CommitSeqNo *subcsn = NULL); +extern void RemoveSubscriptionRel(Oid subid, Oid relid); + +extern List *GetSubscriptionRelations(Oid subid, bool needNotReady); + +#endif /* PG_SUBSCRIPTION_REL_H */ \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_607.sql b/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_607.sql new file mode 100644 index 0000000000..d6fd0ae9c2 --- /dev/null +++ b/src/include/catalog/upgrade_sql/rollback_catalog_maindb/rollback_catalog_maindb_92_607.sql @@ -0,0 +1,37 @@ +-- deleting system table pg_subscription_rel +DROP INDEX IF EXISTS pg_catalog.pg_subscription_rel_srrelid_srsubid_index; +DROP TYPE IF EXISTS pg_catalog.pg_subscription_rel; +DROP TABLE IF EXISTS pg_catalog.pg_subscription_rel; + +DROP FUNCTION IF EXISTS pg_catalog.pg_get_replica_identity_index(regclass); + +DROP FUNCTION IF EXISTS pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT relid oid, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) CASCADE; + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 2802; + +CREATE FUNCTION 
pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) RETURNS record LANGUAGE INTERNAL STABLE AS 'pg_stat_get_subscription'; + +DROP VIEW IF EXISTS pg_catalog.pg_stat_subscription CASCADE; + +do $$DECLARE ans boolean; +BEGIN + for ans in select case when count(*)=1 then true else false end as ans from (select relname from pg_class where relname='pg_subscription') + LOOP + if ans = true then + CREATE VIEW pg_catalog.pg_stat_subscription AS + SELECT + su.oid AS subid, + su.subname, + st.pid, + st.received_lsn, + st.last_msg_send_time, + st.last_msg_receipt_time, + st.latest_end_lsn, + st.latest_end_time + FROM pg_subscription su + LEFT JOIN pg_stat_get_subscription(NULL) st + ON (st.subid = su.oid); + end if; + exit; + END LOOP; +END$$; \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_607.sql b/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_607.sql new file mode 100644 index 0000000000..d6fd0ae9c2 --- /dev/null +++ b/src/include/catalog/upgrade_sql/rollback_catalog_otherdb/rollback_catalog_otherdb_92_607.sql @@ -0,0 +1,37 @@ +-- deleting system table pg_subscription_rel +DROP INDEX IF EXISTS pg_catalog.pg_subscription_rel_srrelid_srsubid_index; +DROP TYPE IF EXISTS pg_catalog.pg_subscription_rel; +DROP TABLE IF EXISTS pg_catalog.pg_subscription_rel; + +DROP FUNCTION IF EXISTS pg_catalog.pg_get_replica_identity_index(regclass); + +DROP FUNCTION IF EXISTS pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT relid oid, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) CASCADE; + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 2802; + +CREATE FUNCTION pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) RETURNS record LANGUAGE INTERNAL STABLE AS 'pg_stat_get_subscription'; + +DROP VIEW IF EXISTS pg_catalog.pg_stat_subscription CASCADE; + +do $$DECLARE ans boolean; +BEGIN + for ans in select case when count(*)=1 then true else false end as ans from (select relname from pg_class where relname='pg_subscription') + LOOP + if ans = true then + CREATE VIEW pg_catalog.pg_stat_subscription AS + SELECT + su.oid AS subid, + su.subname, + st.pid, + st.received_lsn, + st.last_msg_send_time, + st.last_msg_receipt_time, + st.latest_end_lsn, + st.latest_end_time + FROM pg_subscription su + LEFT JOIN pg_stat_get_subscription(NULL) st + ON (st.subid = su.oid); + end if; + exit; + END LOOP; +END$$; \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_607.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_607.sql new file mode 100644 index 0000000000..f97cfebd48 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade-post_catalog_maindb_92_607.sql @@ -0,0 +1,26 @@ +-- adding a column for pg_stat_get_subscription + +DROP FUNCTION IF 
EXISTS pg_catalog.pg_stat_get_subscription; + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 2802; + +DROP FUNCTION IF EXISTS pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) CASCADE; + +CREATE FUNCTION pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT relid oid, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) RETURNS record LANGUAGE INTERNAL STABLE AS 'pg_stat_get_subscription'; + +DROP VIEW IF EXISTS pg_catalog.pg_stat_subscription CASCADE; + +CREATE VIEW pg_catalog.pg_stat_subscription AS + SELECT + su.oid AS subid, + su.subname, + st.pid, + st.relid, + st.received_lsn, + st.last_msg_send_time, + st.last_msg_receipt_time, + st.latest_end_lsn, + st.latest_end_time + FROM pg_subscription su + LEFT JOIN pg_stat_get_subscription(NULL) st + ON (st.subid = su.oid); \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_607.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_607.sql new file mode 100644 index 0000000000..cdf4edd539 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_maindb/upgrade_catalog_maindb_92_607.sql @@ -0,0 +1,31 @@ +-- adding system table pg_subscription_rel + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_CATALOG, false, true, 6135, 6139, 0, 0; + +CREATE TABLE IF NOT EXISTS pg_catalog.pg_subscription_rel +( + srsubid oid NOCOMPRESS NOT NULL, + srrelid oid NOCOMPRESS NOT NULL, + srsubstate "char" NOCOMPRESS NOT NULL, + srcsn bigint NOCOMPRESS NOT NULL, + srsublsn text NOCOMPRESS NOT NULL +) WITHOUT OIDS; + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_CATALOG, false, true, 0, 0, 0, 6138; +CREATE UNIQUE INDEX pg_subscription_rel_srrelid_srsubid_index ON pg_catalog.pg_subscription_rel USING BTREE(srrelid OID_OPS, srsubid OID_OPS); + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_CATALOG, false, true, 0, 0, 0, 0; + +GRANT SELECT ON TABLE pg_catalog.pg_subscription_rel TO PUBLIC; + +-- adding builtin function pg_get_replica_identity_index + +DROP FUNCTION IF EXISTS pg_catalog.pg_get_replica_identity_index(regclass); + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 6120; + +CREATE OR REPLACE FUNCTION pg_catalog.pg_get_replica_identity_index(regclass) +returns oid +LANGUAGE internal +STABLE STRICT NOT FENCED +AS $function$pg_get_replica_identity_index$function$; \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_607.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_607.sql new file mode 100644 index 0000000000..f97cfebd48 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade-post_catalog_otherdb_92_607.sql @@ -0,0 +1,26 @@ +-- adding a column for pg_stat_get_subscription + +DROP FUNCTION IF EXISTS pg_catalog.pg_stat_get_subscription; + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 2802; + +DROP FUNCTION IF EXISTS pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT received_lsn text, OUT 
last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) CASCADE; + +CREATE FUNCTION pg_catalog.pg_stat_get_subscription(IN subid oid, OUT subid oid, OUT pid integer, OUT relid oid, OUT received_lsn text, OUT last_msg_send_time timestamp with time zone, OUT last_msg_receipt_time timestamp with time zone, OUT latest_end_lsn text, OUT latest_end_time timestamp with time zone) RETURNS record LANGUAGE INTERNAL STABLE AS 'pg_stat_get_subscription'; + +DROP VIEW IF EXISTS pg_catalog.pg_stat_subscription CASCADE; + +CREATE VIEW pg_catalog.pg_stat_subscription AS + SELECT + su.oid AS subid, + su.subname, + st.pid, + st.relid, + st.received_lsn, + st.last_msg_send_time, + st.last_msg_receipt_time, + st.latest_end_lsn, + st.latest_end_time + FROM pg_subscription su + LEFT JOIN pg_stat_get_subscription(NULL) st + ON (st.subid = su.oid); \ No newline at end of file diff --git a/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_607.sql b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_607.sql new file mode 100644 index 0000000000..cdf4edd539 --- /dev/null +++ b/src/include/catalog/upgrade_sql/upgrade_catalog_otherdb/upgrade_catalog_otherdb_92_607.sql @@ -0,0 +1,31 @@ +-- adding system table pg_subscription_rel + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_CATALOG, false, true, 6135, 6139, 0, 0; + +CREATE TABLE IF NOT EXISTS pg_catalog.pg_subscription_rel +( + srsubid oid NOCOMPRESS NOT NULL, + srrelid oid NOCOMPRESS NOT NULL, + srsubstate "char" NOCOMPRESS NOT NULL, + srcsn bigint NOCOMPRESS NOT NULL, + srsublsn text NOCOMPRESS NOT NULL +) WITHOUT OIDS; + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_CATALOG, false, true, 0, 0, 0, 6138; +CREATE UNIQUE INDEX pg_subscription_rel_srrelid_srsubid_index ON pg_catalog.pg_subscription_rel USING BTREE(srrelid OID_OPS, srsubid OID_OPS); + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_CATALOG, false, true, 0, 0, 0, 0; + +GRANT SELECT ON TABLE pg_catalog.pg_subscription_rel TO PUBLIC; + +-- adding builtin function pg_get_replica_identity_index + +DROP FUNCTION IF EXISTS pg_catalog.pg_get_replica_identity_index(regclass); + +SET LOCAL inplace_upgrade_next_system_object_oids = IUO_PROC, 6120; + +CREATE OR REPLACE FUNCTION pg_catalog.pg_get_replica_identity_index(regclass) +returns oid +LANGUAGE internal +STABLE STRICT NOT FENCED +AS $function$pg_get_replica_identity_index$function$; \ No newline at end of file diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index f19607092f..71ca8be1c2 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -391,13 +391,15 @@ void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum* values, const bool* nul extern void ProcessCopyOptions(CopyState cstate, bool is_from, List* options); extern bool IsTypeAcceptEmptyStr(Oid typeOid); extern CopyState BeginCopyFrom(Relation rel, const char* filename, List* attnamelist, - List* options, void* mem_info, const char* queryString); + List* options, void* mem_info, const char* queryString, CopyGetDataFunc func = NULL); extern void EndCopyFrom(CopyState cstate); extern bool NextCopyFrom(CopyState cstate, ExprContext* econtext, Datum* values, bool* nulls, Oid* tupleOid); extern bool NextCopyFromRawFields(CopyState cstate, char*** fields, int* nfields); extern void CopyFromErrorCallback(void* arg); extern void BulkloadErrorCallback(void* 
arg); +extern uint64 CopyFrom(CopyState cstate); + extern DestReceiver* CreateCopyDestReceiver(void); extern CopyState begin_dist_copy_from( diff --git a/src/include/commands/subscriptioncmds.h b/src/include/commands/subscriptioncmds.h index 4f14982253..2002bb4113 100644 --- a/src/include/commands/subscriptioncmds.h +++ b/src/include/commands/subscriptioncmds.h @@ -23,7 +23,7 @@ typedef struct HostPort { } HostPort; extern ObjectAddress CreateSubscription(CreateSubscriptionStmt *stmt, bool isTopLevel); -extern ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt); +extern ObjectAddress AlterSubscription(AlterSubscriptionStmt *stmt, bool isTopLevel); extern void DropSubscription(DropSubscriptionStmt *stmt, bool isTopLevel); extern ObjectAddress AlterSubscriptionOwner(const char *name, Oid newOwnerId); diff --git a/src/include/knl/knl_guc/knl_instance_attr_storage.h b/src/include/knl/knl_guc/knl_instance_attr_storage.h index d52797c9bf..74e7d6874c 100755 --- a/src/include/knl/knl_guc/knl_instance_attr_storage.h +++ b/src/include/knl/knl_guc/knl_instance_attr_storage.h @@ -154,6 +154,7 @@ typedef struct knl_instance_attr_storage { int wal_flush_timeout; int wal_flush_delay; int max_logical_replication_workers; + int max_sync_workers_per_subscription; char *redo_bind_cpu_attr; } knl_instance_attr_storage; diff --git a/src/include/knl/knl_session.h b/src/include/knl/knl_session.h index fbd7b338bd..3c214ed66b 100644 --- a/src/include/knl/knl_session.h +++ b/src/include/knl/knl_session.h @@ -623,7 +623,7 @@ typedef struct knl_u_utils_context { */ struct SnapshotData* FirstXactSnapshot; - /* Current xact's exported snapshots (a list of Snapshot structs) */ + /* Current xact's exported snapshots (a list of ExportedSnapshot structs) */ List* exportedSnapshots; uint8_t g_output_version; /* Set the default output schema. 
*/ diff --git a/src/include/knl/knl_thread.h b/src/include/knl/knl_thread.h index f071cca2f8..69a5be22a1 100755 --- a/src/include/knl/knl_thread.h +++ b/src/include/knl/knl_thread.h @@ -2402,6 +2402,7 @@ typedef struct knl_t_walsender_context { bool is_obsmode; bool standbyConnection; bool cancelLogCtl; + bool isUseSnapshot; } knl_t_walsender_context; typedef struct knl_t_walreceiverfuncs_context { @@ -3234,6 +3235,11 @@ typedef struct knl_t_apply_worker_context { MemoryContext messageContext; MemoryContext logicalRepRelMapContext; MemoryContext applyContext; + StringInfo copybuf; + bool tableStatesValid; + List *tableStates; + XLogRecPtr remoteFinalLsn; + CommitSeqNo curRemoteCsn; } knl_t_apply_worker_context; typedef struct knl_t_publication_context { diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 80e2ab6db3..2d102a53a3 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -32,6 +32,8 @@ #include "pgtime.h" /* for pg_time_t */ #include "libpq/libpq-be.h" +#define InvalidPid (-1) + #define PG_BACKEND_VERSIONSTR "gaussdb " DEF_GS_VERSION "\n" /***************************************************************************** @@ -94,6 +96,7 @@ extern const uint32 SUBSCRIPTION_BINARY_VERSION_NUM; extern const uint32 ANALYZER_HOOK_VERSION_NUM; extern const uint32 SUPPORT_HASH_XLOG_VERSION_NUM; extern const uint32 PITR_INIT_VERSION_NUM; +extern const uint32 PUBLICATION_INITIAL_DATA_VERSION_NAME; extern void register_backend_version(uint32 backend_version); extern bool contain_backend_version(uint32 version_number); diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 95f1b3c30a..7210c993e9 100755 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -588,6 +588,7 @@ typedef enum NodeTag { T_AdvanceReplicationCmd, T_StartDataReplicationCmd, T_FetchMotCheckpointCmd, + T_SQLCmd, /* * TAGS FOR RANDOM OTHER STUFF diff --git a/src/include/nodes/parsenodes_common.h b/src/include/nodes/parsenodes_common.h index 8e79f0381d..cd4147dc3d 100644 --- a/src/include/nodes/parsenodes_common.h +++ b/src/include/nodes/parsenodes_common.h @@ -2199,6 +2199,7 @@ typedef struct AlterSubscriptionStmt { NodeTag type; char *subname; /* Name of the subscription */ List *options; /* List of DefElem nodes */ + bool refresh; } AlterSubscriptionStmt; typedef struct DropSubscriptionStmt { diff --git a/src/include/nodes/replnodes.h b/src/include/nodes/replnodes.h index 20cfdb95b2..e451f44fc6 100644 --- a/src/include/nodes/replnodes.h +++ b/src/include/nodes/replnodes.h @@ -72,6 +72,7 @@ typedef struct CreateReplicationSlotCmd { ReplicationKind kind; XLogRecPtr init_slot_lsn; char* plugin; + bool useSnapshot; } CreateReplicationSlotCmd; /* ---------------------- @@ -123,4 +124,13 @@ typedef struct FetchMotCheckpointCmd { NodeTag type; } FetchMotCheckpointCmd; +/* ---------------------- + * SQL commands + * ---------------------- + */ +typedef struct SQLCmd +{ + NodeTag type; +} SQLCmd; + #endif /* REPLNODES_H */ diff --git a/src/include/pgstat.h b/src/include/pgstat.h index dc25f7b0b3..3148a13790 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1338,6 +1338,9 @@ typedef enum WaitEventIO { WAIT_EVENT_LOGCTRL_SLEEP, WAIT_EVENT_COMPRESS_ADDRESS_FILE_FLUSH, WAIT_EVENT_COMPRESS_ADDRESS_FILE_SYNC, + WAIT_EVENT_LOGICAL_SYNC_DATA, + WAIT_EVENT_LOGICAL_SYNC_STATE_CHANGE, + WAIT_EVENT_REPLICATION_ORIGIN_DROP, IO_EVENT_NUM = WAIT_EVENT_LOGCTRL_SLEEP - WAIT_EVENT_BUFFILE_READ + 1 // MUST be last, DO NOT use this value.
} WaitEventIO; diff --git a/src/include/replication/libpqwalreceiver.h b/src/include/replication/libpqwalreceiver.h index ecd6a1a920..ebe25688bb 100755 --- a/src/include/replication/libpqwalreceiver.h +++ b/src/include/replication/libpqwalreceiver.h @@ -25,6 +25,8 @@ #ifndef LIBPQWALRECEIVER_H #define LIBPQWALRECEIVER_H +#include "utils/tuplestore.h" + typedef struct LibpqrcvConnectParam { char* conninfo; XLogRecPtr startpoint; @@ -36,8 +38,35 @@ typedef struct LibpqrcvConnectParam { uint32 protoVersion; /* Logical protocol version */ List *publicationNames; /* String list of publications */ bool binary; /* Ask publisher to use binary */ + bool useSnapshot; /* Use snapshot or not */ }LibpqrcvConnectParam; +/* + * Status of walreceiver query execution. + * + * We only define statuses that are currently used. + */ +typedef enum { + WALRCV_ERROR, /* There was an error when executing the query. */ + WALRCV_OK_COMMAND, /* Query executed utility or replication command. */ + WALRCV_OK_TUPLES, /* Query returned tuples. */ + WALRCV_OK_COPY_IN, /* Query started COPY FROM. */ + WALRCV_OK_COPY_OUT, /* Query started COPY TO. */ + WALRCV_OK_COPY_BOTH, /* Query started COPY BOTH replication protocol. */ +} WalRcvExecStatus; + +/* + * Return value for walrcv_exec, returns the status of the execution and + * tuples if any. + */ +typedef struct WalRcvExecResult { + WalRcvExecStatus status; + int sqlstate; + char *err; + Tuplestorestate *tuplestore; + TupleDesc tupledesc; +} WalRcvExecResult; + extern int32 pg_atoi(char* s, int size, int c); extern int32 pg_strtoint32(const char* s); /* Prototypes for interface functions */ @@ -49,10 +78,11 @@ extern void libpqrcv_disconnect(void); extern void HaSetRebuildRepInfoError(HaRebuildReason reason); extern void SetObsRebuildReason(HaRebuildReason reason); extern void libpqrcv_check_conninfo(const char *conninfo); -extern bool libpqrcv_command(const char *cmd, char **err, int *sqlstate); +extern WalRcvExecResult* libpqrcv_exec(const char *cmd, const int nRetTypes, const Oid *retTypes); extern void IdentifyRemoteSystem(bool checkRemote); -extern void CreateRemoteReplicationSlot(XLogRecPtr startpoint, const char* slotname, bool isLogical); +extern void CreateRemoteReplicationSlot(XLogRecPtr startpoint, const char* slotname, bool isLogical, XLogRecPtr *lsn, + bool useSnapshot = false, CommitSeqNo *csn = NULL); extern void StartRemoteStreaming(const LibpqrcvConnectParam *options); extern ServerMode IdentifyRemoteMode(); diff --git a/src/include/replication/logicalproto.h b/src/include/replication/logicalproto.h index 4e2876b86e..794e661315 100644 --- a/src/include/replication/logicalproto.h +++ b/src/include/replication/logicalproto.h @@ -77,6 +77,7 @@ typedef struct LogicalRepBeginData { XLogRecPtr final_lsn; TimestampTz committime; TransactionId xid; + CommitSeqNo csn; } LogicalRepBeginData; typedef struct LogicalRepCommitData { diff --git a/src/include/replication/origin.h b/src/include/replication/origin.h index 0343c76790..f5f37fac6d 100644 --- a/src/include/replication/origin.h +++ b/src/include/replication/origin.h @@ -52,6 +52,12 @@ typedef struct ReplicationState { */ ThreadId acquired_by; + pthread_mutex_t originMutex; + + pthread_cond_t orginCV; + + pthread_condattr_t originAttr; + /* * Lock protecting remote_lsn and local_lsn.
*/ @@ -72,7 +78,7 @@ typedef struct ReplicationStateShmStruct { /* API for querying & manipulating replication origins */ extern RepOriginId replorigin_by_name(const char *name, bool missing_ok); extern RepOriginId replorigin_create(const char *name); -extern void replorigin_drop_by_name(const char *name, bool missing_ok); +extern void replorigin_drop_by_name(const char *name, bool missing_ok, bool nowait); extern bool replorigin_by_oid(RepOriginId roident, bool missing_ok, char **roname); /* API for querying & manipulating replication progress tracking */ diff --git a/src/include/replication/slot.h b/src/include/replication/slot.h index f8bebb5679..dec3ce8c72 100755 --- a/src/include/replication/slot.h +++ b/src/include/replication/slot.h @@ -333,6 +333,7 @@ extern ArchiveTaskStatus* find_archive_task_status(int *idx); extern ArchiveTaskStatus* walreceiver_find_archive_task_status(unsigned int expected_pitr_task_status); extern void get_hadr_cn_info(char* keyCn, bool* isExitKey, char* deleteCn, bool* isExitDelete, ArchiveSlotConfig *archive_conf); - +extern void ReplicationSlotNameForTablesync(Oid suboid, Oid relid, char *syncslotname, int szslot); +extern void ReplicationSlotDropAtPubNode(char *slotname, bool missing_ok); #endif /* SLOT_H */ diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 765542a381..fa3c5c3a90 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -65,6 +65,7 @@ extern void FreeSnapshotBuilder(SnapBuild* cache); extern void SnapBuildSnapDecRefcount(Snapshot snap); +extern Snapshot SnapBuildInitialSnapshot(SnapBuild *builder); extern const char* SnapBuildExportSnapshot(SnapBuild* snapstate); extern void SnapBuildClearExportedSnapshot(void); diff --git a/src/include/replication/subscription_walreceiver.h b/src/include/replication/subscription_walreceiver.h index 51b00d947b..91ca75efb2 100644 --- a/src/include/replication/subscription_walreceiver.h +++ b/src/include/replication/subscription_walreceiver.h @@ -32,5 +32,5 @@ extern bool sub_connect(char *conninfo, XLogRecPtr *startpoint, char *appname, int channel_identifier); extern void sub_identify_system(); extern void sub_startstreaming(const LibpqrcvConnectParam *options); -extern void sub_create_slot(const LibpqrcvConnectParam *options); +extern void sub_create_slot(const LibpqrcvConnectParam *options, XLogRecPtr *lsn, CommitSeqNo *csn); #endif diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index 6d6df41773..5be2ddc731 100755 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -238,10 +238,10 @@ typedef struct WalReceiverFunc { bool (*walrcv_receive)(int timeout, unsigned char* type, char** buffer, int* len); void (*walrcv_send)(const char *buffer, int nbytes); void (*walrcv_disconnect)(); - bool (*walrcv_command)(const char *cmd, char **err, int *sqlstate); + WalRcvExecResult* (*walrcv_exec)(const char *cmd, const int nRetTypes, const Oid *retTypes); void (*walrcv_identify_system)(); void (*walrcv_startstreaming)(const LibpqrcvConnectParam *options); - void (*walrcv_create_slot)(const LibpqrcvConnectParam *options); + void (*walrcv_create_slot)(const LibpqrcvConnectParam *options, XLogRecPtr *lsn, CommitSeqNo *csn); } WalReceiverFunc; #define WalRcvIsOnline() \ @@ -340,4 +340,21 @@ static inline void WalRcvCtlReleaseExitLock(void) volatile WalRcvData *walrcv = t_thrd.walreceiverfuncs_cxt.WalRcv; SpinLockRelease(&walrcv->exitLock); } + +static inline void 
walrcv_clear_result(WalRcvExecResult *walres) +{ + if (!walres) + return; + + if (walres->err) + pfree(walres->err); + + if (walres->tuplestore) + tuplestore_end(walres->tuplestore); + + if (walres->tupledesc) + FreeTupleDesc(walres->tupledesc); + + pfree(walres); +} #endif /* _WALRECEIVER_H */ diff --git a/src/include/replication/worker_internal.h b/src/include/replication/worker_internal.h index 41656d4499..d6d1f4fc86 100644 --- a/src/include/replication/worker_internal.h +++ b/src/include/replication/worker_internal.h @@ -23,9 +23,6 @@ typedef struct LogicalRepWorker /* Database id to connect to. */ Oid dbid; - /* User to use for connection (will be same as owner of subscription). */ - NameData username; - Oid userid; /* Subscription id for the worker. */ @@ -33,6 +30,10 @@ typedef struct LogicalRepWorker /* Used for initial table synchronization. */ Oid relid; + char relstate; + XLogRecPtr relstate_lsn; + CommitSeqNo relcsn; + slock_t relmutex; TimestampTz workerLaunchTime; @@ -53,10 +54,22 @@ typedef struct ApplyLauncherShmStruct { } ApplyLauncherShmStruct; extern void logicalrep_worker_attach(); +extern LogicalRepWorker *logicalrep_worker_find(Oid subid, Oid relid, bool only_running); extern List *logicalrep_workers_find(Oid subid, bool only_running); -extern void logicalrep_worker_stop(Oid subid); +extern void logicalrep_worker_launch(Oid dbid, Oid subid, const char *subname, Oid userid, Oid relid); +extern void logicalrep_worker_stop(Oid subid, Oid relid); +extern void logicalrep_worker_wakeup(Oid subid, Oid relid); +extern void logicalrep_worker_wakeup_ptr(LogicalRepWorker *worker); + +extern int logicalrep_sync_worker_count(Oid subid); +extern void ReplicationOriginNameForTablesync(Oid suboid, Oid relid, char *originname, int szorgname); extern char* DefListToString(const List *defList); extern List* ConninfoToDefList(const char *conn); +extern char *LogicalRepSyncTableStart(XLogRecPtr *origin_startpos); +void process_syncing_tables(XLogRecPtr current_lsn); +void invalidate_syncing_table_states(Datum arg, int cacheid, uint32 hashvalue); + +#define AM_TABLESYNC_WORKER (OidIsValid(t_thrd.applyworker_cxt.curWorker->relid)) #endif /* WORKER_INTERNAL_H */ diff --git a/src/include/storage/predicate.h b/src/include/storage/predicate.h index 716e26e0f9..eac253a2bd 100644 --- a/src/include/storage/predicate.h +++ b/src/include/storage/predicate.h @@ -35,7 +35,7 @@ extern bool PageIsPredicateLocked(Relation relation, BlockNumber blkno); /* predicate lock maintenance */ extern Snapshot GetSerializableTransactionSnapshot(Snapshot snapshot); -extern void SetSerializableTransactionSnapshot(Snapshot snapshot, TransactionId sourcexid); +extern void SetSerializableTransactionSnapshot(Snapshot snapshot, VirtualTransactionId *sourcevxid, int sourcepid); extern void RegisterPredicateLockingXid(TransactionId xid); extern void PredicateLockRelation(Relation relation, Snapshot snapshot); extern void PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot); diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 151b5b5d59..a301960bff 100755 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -70,7 +70,7 @@ extern Snapshot GetSnapshotData(Snapshot snapshot, bool force_local_snapshot); extern Snapshot GetLocalSnapshotData(Snapshot snapshot); -extern bool ProcArrayInstallImportedXmin(TransactionId xmin, TransactionId sourcexid); +extern bool ProcArrayInstallImportedXmin(TransactionId xmin, VirtualTransactionId *sourcevxid); extern 
RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsActive(TransactionId xid); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 1f835b31ae..8aebbfc92f 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -642,6 +642,7 @@ extern Datum oidvectorge(PG_FUNCTION_ARGS); extern Datum oidvectorgt(PG_FUNCTION_ARGS); extern oidvector* buildoidvector(const Oid* oids, int n); extern Oid oidparse(Node* node); +extern int oid_cmp(const void *p1, const void *p2); /* pseudotypes.c */ extern Datum cstring_in(PG_FUNCTION_ARGS); diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 122f03678b..7405b0c81f 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -115,4 +115,6 @@ extern struct HTAB* HistoricSnapshotGetTupleCids(void); extern void SetupHistoricSnapshot(Snapshot snapshot_now, struct HTAB* tuplecids); extern void TeardownHistoricSnapshot(bool is_error); extern bool HistoricSnapshotActive(void); + +extern void SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, int sourcepid); #endif /* SNAPMGR_H */ diff --git a/src/include/utils/syscache.h b/src/include/utils/syscache.h index a1b45496a8..26a63136ef 100644 --- a/src/include/utils/syscache.h +++ b/src/include/utils/syscache.h @@ -61,7 +61,7 @@ enum SysCacheIdentifier { DATABASEOID, DATASOURCENAME, DATASOURCEOID, - DB4AI_MODELOID, + SUBSCRIPTIONRELMAP, DB4AI_MODEL, DEFACLROLENSPOBJ, DIRECTORYNAME, diff --git a/src/test/ha/pubsub_check.sh b/src/test/ha/pubsub_check.sh new file mode 100644 index 0000000000..3ed2e4fe34 --- /dev/null +++ b/src/test/ha/pubsub_check.sh @@ -0,0 +1,73 @@ +#!/bin/sh +source ./util.sh + +pub_datadir="$data_dir/pub" +pub_port=8988 +conn_port=8989 +sub_datadir="$data_dir/sub" +sub_port=8990 + +function test_1() +{ + echo "init database and start" + gs_initdb -D $pub_datadir --nodename='sgnode' -w Gauss@123 + gs_initdb -D $sub_datadir --nodename='sgnode' -w Gauss@123 + + gs_guc set -D $pub_datadir -c "wal_level = logical" + gs_guc set -D $sub_datadir -c "wal_level = logical" + + gs_guc generate -S Gauss@123 -D $GAUSSHOME/bin -o subscription + sed -i '$ahost replication '"$username"' 127.0.0.1/32 sha256' $pub_datadir/pg_hba.conf + + gaussdb -D $pub_datadir -p $pub_port & + gaussdb -D $sub_datadir -p $sub_port & + + sleep 10 + + echo "create tables and insert data" + gsql -d postgres -p $pub_port -c "create table t1 (a int, b int); insert into t1 values (1,1),(2,2),(3,3); create table t2(a int); insert into t2 values (1),(2),(3);" + gsql -d postgres -p $sub_port -c "create table t1 (a int, b int); create table t2(a int);" + + echo "create publication for t1" + gsql -d postgres -p $pub_port -c "create publication pub1 for table t1;" + + echo "create subscription for pub1 and check initial data" + gsql -d postgres -p $sub_port -c "create subscription sub1 connection 'host=127.0.0.1 port=$conn_port user=$username dbname=postgres password=Gauss@123' publication pub1;" + sleep 2 + if [ $(gsql -d postgres -p $sub_port -c "select count(*) from t1;" | grep `expr 1 \* 3` |wc -l) -eq 1 ]; then + echo "initial data on t1 synchronize success" + else + echo "initial data on t1 synchronize $failed_keyword" + exit 1 + fi + + echo "alter pub1 add t2 and refresh sub" + gsql -d postgres -p $pub_port -c "alter publication pub1 add table t2;" + gsql -d postgres -p $sub_port -c "alter subscription sub1 refresh publication;" + sleep 2 + if [ $(gsql -d postgres -p $sub_port -c "select count(*) from 
t2;" | grep `expr 1 \* 3` |wc -l) -eq 1 ]; then + echo "initial data on t2 synchronize success" + else + echo "initial data on t2 synchronize $failed_keyword" + exit 1 + fi + + echo "test incremental data" + gsql -d postgres -p $pub_port -c "insert into t2 values (4);" + sleep 2 + if [ $(gsql -d postgres -p $sub_port -c "select count(*) from t2;" | grep `expr 1 \* 4` |wc -l) -eq 1 ]; then + echo "incremental data on t2 synchronize success" + else + echo "incremental data on t2 synchronize $failed_keyword" + exit 1 + fi +} + +function tear_down() { + ps -ef | grep -w $pub_datadir | grep -v grep | awk '{print $2}' | xargs kill -9 + ps -ef | grep -w $sub_datadir | grep -v grep | awk '{print $2}' | xargs kill -9 +} + +test_1 > ./results/pubsub_check.log 2>&1 +tear_down >> ./results/pubsub_check.log 2>&1 +echo "publication and subscription test ok." \ No newline at end of file diff --git a/src/test/regress/input/publication.source b/src/test/regress/input/publication.source index 6195e47770..2c25c8fdfc 100644 --- a/src/test/regress/input/publication.source +++ b/src/test/regress/input/publication.source @@ -98,4 +98,8 @@ SELECT object_name,detail_info FROM pg_query_audit('2022-01-13 9:30:00', '2031-1 --clear audit log SELECT pg_delete_audit('1012-11-10', '3012-11-11'); +create table replica_test (a int primary key, b int); +select pg_get_replica_identity_index('replica_test'); +drop table replica_test; + \! @abs_bindir@/gs_guc reload -D @abs_srcdir@/tmp_check/datanode1/ -c "audit_system_object" > /dev/null 2>&1 diff --git a/src/test/regress/input/subscription.source b/src/test/regress/input/subscription.source index 2d5a4c4e98..9f0331861f 100644 --- a/src/test/regress/input/subscription.source +++ b/src/test/regress/input/subscription.source @@ -79,6 +79,9 @@ select subname, subenabled from pg_subscription where subname='testsub_rename'; ALTER SUBSCRIPTION testsub_rename SET (ENABLED=false); select subname, subenabled from pg_subscription where subname='testsub_rename'; COMMIT; +BEGIN; +ALTER SUBSCRIPTION testsub_rename REFRESH PUBLICATION; +COMMIT; --- drop subscription DROP SUBSCRIPTION IF EXISTS testsub_rename; DROP SUBSCRIPTION IF EXISTS testsub_maskconninfo; diff --git a/src/test/regress/output/publication.source b/src/test/regress/output/publication.source index b8cbc25b07..a3b6718bfd 100644 --- a/src/test/regress/output/publication.source +++ b/src/test/regress/output/publication.source @@ -219,4 +219,13 @@ SELECT pg_delete_audit('1012-11-10', '3012-11-11'); (1 row) +create table replica_test (a int primary key, b int); +NOTICE: CREATE TABLE / PRIMARY KEY will create implicit index "replica_test_pkey" for table "replica_test" +select pg_get_replica_identity_index('replica_test'); + pg_get_replica_identity_index +------------------------------- + replica_test_pkey +(1 row) + +drop table replica_test; \! @abs_bindir@/gs_guc reload -D @abs_srcdir@/tmp_check/datanode1/ -c "audit_system_object" > /dev/null 2>&1 diff --git a/src/test/regress/output/subscription.source b/src/test/regress/output/subscription.source index 7bf3634089..0bbe46dbd7 100644 --- a/src/test/regress/output/subscription.source +++ b/src/test/regress/output/subscription.source @@ -17,6 +17,7 @@ Description: change the definition of a subscription Syntax: ALTER SUBSCRIPTION name CONNECTION 'conninfo' ALTER SUBSCRIPTION name SET PUBLICATION publication_name [, ...] +ALTER SUBSCRIPTION name REFRESH PUBLICATION [ WITH ( refresh_option [= value] [, ... 
] ) ] ALTER SUBSCRIPTION name ENABLE ALTER SUBSCRIPTION name SET ( subscription_parameter [= value] [, ... ] ) ALTER SUBSCRIPTION name OWNER TO new_owner @@ -158,6 +159,10 @@ ERROR: current transaction is aborted, commands ignored until end of transactio select subname, subenabled from pg_subscription where subname='testsub_rename'; ERROR: current transaction is aborted, commands ignored until end of transaction block, firstChar[Q] COMMIT; +BEGIN; +ALTER SUBSCRIPTION testsub_rename REFRESH PUBLICATION; +ERROR: ALTER SUBSCRIPTION ... REFRESH cannot run inside a transaction block +COMMIT; --- drop subscription DROP SUBSCRIPTION IF EXISTS testsub_rename; DROP SUBSCRIPTION IF EXISTS testsub_maskconninfo; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b44b866c12..a08f9357ef 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1501,6 +1501,7 @@ SPLITCOST SPNode SPNodeData SPPageDesc +SQLCmd SQLFunctionCache SQLFunctionCachePtr SQLFunctionParseInfoPtr -- Gitee
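
A minimal end-to-end usage sketch of the initial copy and refresh flow exercised above (connection values are placeholders; assumes two local instances running with wal_level = logical and tables t1/t2 already created on both sides, as in pubsub_check.sh):

    -- on the publisher
    INSERT INTO t1 VALUES (1,1),(2,2),(3,3);
    CREATE PUBLICATION pub1 FOR TABLE t1;

    -- on the subscriber: CREATE SUBSCRIPTION now also copies the existing rows of t1
    CREATE SUBSCRIPTION sub1 CONNECTION 'host=127.0.0.1 port=8989 user=repuser dbname=postgres password=Gauss@123' PUBLICATION pub1;

    -- on the publisher: extend the publication
    ALTER PUBLICATION pub1 ADD TABLE t2;

    -- on the subscriber: pick up t2 and copy its initial data
    -- (not allowed inside a transaction block)
    ALTER SUBSCRIPTION sub1 REFRESH PUBLICATION;

    -- per-table synchronization state on the subscriber:
    -- 'i' init, 'd' copying data, 'f' copy finished, 's' sync done, 'r' ready
    SELECT srsubid, srrelid::regclass, srsubstate FROM pg_catalog.pg_subscription_rel;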