diff --git a/KunpengBoostKit22.0.RC4-CODE-SCHED-AFFINITY-FOR-MySQL-8.0.25.patch b/KunpengBoostKit22.0.RC4-CODE-SCHED-AFFINITY-FOR-MySQL-8.0.25.patch
new file mode 100644
index 0000000000000000000000000000000000000000..dbc0e3c63ec88184854d543405db338f2749166a
--- /dev/null
+++ b/KunpengBoostKit22.0.RC4-CODE-SCHED-AFFINITY-FOR-MySQL-8.0.25.patch
@@ -0,0 +1,1587 @@
+diff --git a/config.h.cmake b/config.h.cmake
+index 2b06257a..17683fd1 100644
+--- a/config.h.cmake
++++ b/config.h.cmake
+@@ -344,6 +344,7 @@
+ 
+ #define SO_EXT "@CMAKE_SHARED_MODULE_SUFFIX@"
+ 
++#cmakedefine GMOCK_FOUND 1
+ 
+ /* From libmysql/CMakeLists.txt */
+ #cmakedefine HAVE_UNIX_DNS_SRV @HAVE_UNIX_DNS_SRV@
+diff --git a/share/messages_to_clients.txt b/share/messages_to_clients.txt
+index 7ff9af75..adc4c487 100644
+--- a/share/messages_to_clients.txt
++++ b/share/messages_to_clients.txt
+@@ -9541,6 +9541,15 @@ ER_SDI_GET_KEYS_INVALID_TABLESPACE
+ ER_CHANGE_RPL_SRC_WRONG_COMPRESSION_ALGORITHM_SIZE
+   eng "Value too long setting SOURCE_COMPRESSION_ALGORITHMS option to a %d chars long string for channel '%.192s'."
+ 
++ER_INVALID_CPU_STRING
++  eng "Invalid cpu string %s."
++
++ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER
++  eng "Cannot update %s successfully."
++
++ER_CANNOT_UPDATE_SCHED_AFFINITY_NUMA_AWARE
++  eng "Cannot update sched_affinity_numa_aware successfully."
++
+ #
+ # End of 8.0 error messages (server-to-client).
+ # Do NOT add messages intended for the error log above!
+diff --git a/share/messages_to_error_log.txt b/share/messages_to_error_log.txt
+index 97f50ddf..2ab19240 100644
+--- a/share/messages_to_error_log.txt
++++ b/share/messages_to_error_log.txt
+@@ -11262,6 +11262,45 @@ ER_IB_WRN_FAILED_TO_ACQUIRE_SERVICE
+ ER_IB_WRN_OLD_GEOMETRY_TYPE
+   eng "Column %s of type GEOMETRY is in old (5.6) format which could be deprecated in the future. To change the format to latest, please consider rebuilding the table after the upgrade."
+ 
++ER_CANT_PARSE_CPU_STRING
++  eng "Cannot parse cpu string '%s'."
++
++ER_LIBNUMA_TEST_FAIL
++  eng "libnuma test failed."
++
++ER_NUMA_AVAILABLE_TEST_FAIL
++  eng "numa_available test failed."
++
++ER_CANNOT_SET_THREAD_SCHED_AFFINIFY
++  eng "Cannot set sched affinity for thread %s."
++
++ER_CANNOT_UNSET_THREAD_SCHED_AFFINIFY
++  eng "Cannot unset sched affinity for thread %s."
++
++ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER
++  eng "Cannot register thread %s to the sched affinity manager."
++
++ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER
++  eng "Cannot unregister thread %s from the sched affinity manager."
++
++ER_USE_DUMMY_SCHED_AFFINITY_MANAGER
++  eng "Use dummy sched_affinity_manager."
++
++ER_SCHED_AFFINITY_THREAD_PROCESS_CONFLICT
++  eng "Found sched affinity conflict between threads and process."
++
++ER_SCHED_AFFINITY_FOREGROUND_BACKGROUND_CONFLICT
++  eng "Found sched affinity conflict between foreground threads and background threads."
++
++ER_CANNOT_CREATE_SCHED_AFFINITY_MANAGER
++  eng "Cannot create sched affinity manager."
++
++ER_SET_FALLBACK_MODE
++  eng "sched_affinity_manager is set to fallback mode."
++
++ER_FALLBACK_DELEGATE_SCHED_AFFINITY_MANAGER_IS_CALLED
++  eng "sched_affinity_manager is in fallback mode. A fallback version of sched_affinity_manager is called, which does nothing."
++
+ # DO NOT add server-to-client messages here;
+ # they go in messages_to_clients.txt
+ # in the same directory as this file.
+diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
+index 5fd225c1..27956a67 100644
+--- a/sql/CMakeLists.txt
++++ b/sql/CMakeLists.txt
+@@ -21,6 +21,7 @@
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ 
+ INCLUDE_DIRECTORIES(SYSTEM ${BOOST_PATCHES_DIR} ${BOOST_INCLUDE_DIR})
++INCLUDE_DIRECTORIES(${GMOCK_INCLUDE_DIRS})
+ 
+ MY_INCLUDE_SYSTEM_DIRECTORIES(ICU)
+ 
+@@ -473,6 +474,7 @@ SET(SQL_SHARED_SOURCES
+   rpl_write_set_handler.cc
+   rules_table_service.cc
+   rwlock_scoped_lock.cc
++  sched_affinity_manager.cc
+   sd_notify.cc
+   sdi_utils.cc
+   session_tracker.cc
+diff --git a/sql/conn_handler/connection_handler_per_thread.cc b/sql/conn_handler/connection_handler_per_thread.cc
+index 0668c4f5..8a60d823 100644
+--- a/sql/conn_handler/connection_handler_per_thread.cc
++++ b/sql/conn_handler/connection_handler_per_thread.cc
+@@ -24,6 +24,7 @@
+ 
+ #include
+ #include
++#include
+ #include
+ #include
+ 
+@@ -55,6 +56,7 @@
+ #include "sql/mysqld.h"  // max_connections
+ #include "sql/mysqld_thd_manager.h"  // Global_THD_manager
+ #include "sql/protocol_classic.h"
++#include "sql/sched_affinity_manager.h"
+ #include "sql/sql_class.h"  // THD
+ #include "sql/sql_connect.h"  // close_connection
+ #include "sql/sql_error.h"
+@@ -294,6 +296,18 @@ static void *handle_connection(void *arg) {
+     mysql_socket_set_thread_owner(socket);
+     thd_manager->add_thd(thd);
+ 
++    auto sched_affinity_manager =
++        sched_affinity::Sched_affinity_manager::get_instance();
++    bool is_registered_to_sched_affinity = false;
++    auto pid = sched_affinity::gettid();
++    if (sched_affinity_manager == nullptr ||
++        !(is_registered_to_sched_affinity =
++              sched_affinity_manager->register_thread(
++                  sched_affinity::Thread_type::FOREGROUND, pid))) {
++      LogErr(ERROR_LEVEL, ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER,
++             "foreground");
++    }
++
+     if (thd_prepare_connection(thd))
+       handler_manager->inc_aborted_connects();
+     else {
+@@ -304,6 +318,13 @@ static void *handle_connection(void *arg) {
+     }
+     close_connection(thd, 0, false, false);
+ 
++    if (is_registered_to_sched_affinity &&
++        !sched_affinity_manager->unregister_thread(pid)) {
++      LogErr(ERROR_LEVEL,
++             ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER,
++             "foreground");
++    }
++
+     thd->get_stmt_da()->reset_diagnostics_area();
+     thd->release_resources();
+ 
+diff --git a/sql/memory/aligned_atomic.h b/sql/memory/aligned_atomic.h
+index 3fed8df6..a668bce1 100644
+--- a/sql/memory/aligned_atomic.h
++++ b/sql/memory/aligned_atomic.h
+@@ -77,7 +77,9 @@ static inline size_t _cache_line_size() {
+ 
+ #elif defined(__linux__)
+ static inline size_t _cache_line_size() {
+-  return sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
++  long size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
++  if (size == -1 || size == 0) return 64;
++  return static_cast<size_t>(size);
+ }
+ 
+ #else
+diff --git a/sql/mysqld.cc b/sql/mysqld.cc
+index 83643f76..7a4173fb 100644
+--- a/sql/mysqld.cc
++++ b/sql/mysqld.cc
+@@ -810,6 +810,7 @@ MySQL clients support the protocol:
+ #include "sql/rpl_rli.h"    // Relay_log_info
+ #include "sql/rpl_slave.h"  // slave_load_tmpdir
+ #include "sql/rpl_trx_tracking.h"
++#include "sql/sched_affinity_manager.h"
+ #include "sql/sd_notify.h"  // sd_notify_connect
+ #include "sql/session_tracker.h"
+ #include "sql/set_var.h"
+@@ -1171,6 +1172,9 @@ uint host_cache_size;
+ ulong log_error_verbosity = 3;  // have a non-zero value during early start-up
+ bool opt_keyring_migration_to_component = false;
+ 
++extern std::map<sched_affinity::Thread_type, char *> sched_affinity_parameter;
++extern bool sched_affinity_numa_aware;
++
+ #if defined(_WIN32)
+ /*
+   Thread handle of shutdown event handler thread.
+@@ -2582,6 +2586,7 @@ static void clean_up(bool print_message) {
+   */
+   sys_var_end();
+   free_status_vars();
++  sched_affinity::Sched_affinity_manager::free_instance();
+ 
+   finish_client_errs();
+   deinit_errmessage();  // finish server errs
+@@ -7168,6 +7173,11 @@ int mysqld_main(int argc, char **argv)
+   /* Determine default TCP port and unix socket name */
+   set_ports();
+ 
++  if (sched_affinity::Sched_affinity_manager::create_instance(sched_affinity_parameter, sched_affinity_numa_aware) == nullptr) {
++    LogErr(ERROR_LEVEL, ER_CANNOT_CREATE_SCHED_AFFINITY_MANAGER);
++    unireg_abort(MYSQLD_ABORT_EXIT);
++  }
++
+   if (init_server_components()) unireg_abort(MYSQLD_ABORT_EXIT);
+ 
+   if (!server_id_supplied)
+@@ -8591,6 +8601,31 @@ static int show_queries(THD *thd, SHOW_VAR *var, char *) {
+   return 0;
+ }
+ 
++static int show_sched_affinity_status(THD *, SHOW_VAR *var, char *buff) {
++  var->type = SHOW_CHAR;
++  var->value = buff;
++  std::string group_snapshot = sched_affinity::Sched_affinity_manager::get_instance()->take_group_snapshot();
++  strncpy(buff, group_snapshot.c_str(), SHOW_VAR_FUNC_BUFF_SIZE);
++  buff[SHOW_VAR_FUNC_BUFF_SIZE] = '\0';
++  return 0;
++}
++
++static int show_sched_affinity_group_number(THD *, SHOW_VAR *var, char *buff) {
++  var->type = SHOW_SIGNED_INT;
++  var->value = buff;
++  *(reinterpret_cast<int *>(buff)) = sched_affinity::Sched_affinity_manager::get_instance()
++      ->get_total_node_number();
++  return 0;
++}
++
++static int show_sched_affinity_group_capacity(THD *, SHOW_VAR *var, char *buff) {
++  var->type = SHOW_SIGNED_INT;
++  var->value = buff;
++  *(reinterpret_cast<int *>(buff)) = sched_affinity::Sched_affinity_manager::get_instance()
++      ->get_cpu_number_per_node();
++  return 0;
++}
++
+ static int show_net_compression(THD *thd, SHOW_VAR *var, char *buff) {
+   var->type = SHOW_MY_BOOL;
+   var->value = buff;
+@@ -9183,6 +9218,12 @@ SHOW_VAR status_vars[] = {
+     {"Queries", (char *)&show_queries, SHOW_FUNC, SHOW_SCOPE_ALL},
+     {"Questions", (char *)offsetof(System_status_var, questions),
+      SHOW_LONGLONG_STATUS, SHOW_SCOPE_ALL},
++    {"Sched_affinity_status",
++     (char *)&show_sched_affinity_status, SHOW_FUNC, SHOW_SCOPE_ALL},
++    {"Sched_affinity_group_number",
++     (char *)&show_sched_affinity_group_number, SHOW_FUNC, SHOW_SCOPE_ALL},
++    {"Sched_affinity_group_capacity",
++     (char *)&show_sched_affinity_group_capacity, SHOW_FUNC, SHOW_SCOPE_ALL},
+     {"Secondary_engine_execution_count",
+      (char *)offsetof(System_status_var, secondary_engine_execution_count),
+      SHOW_LONGLONG_STATUS, SHOW_SCOPE_ALL},
+@@ -11022,6 +11063,7 @@ PSI_mutex_key key_thd_timer_mutex;
+ PSI_mutex_key key_commit_order_manager_mutex;
+ PSI_mutex_key key_mutex_slave_worker_hash;
+ PSI_mutex_key key_monitor_info_run_lock;
++PSI_mutex_key key_sched_affinity_mutex;
+ 
+ /* clang-format off */
+ static PSI_mutex_info all_server_mutexes[]=
+@@ -11107,7 +11149,8 @@ static PSI_mutex_info all_server_mutexes[]=
+   { &key_LOCK_tls_ctx_options, "LOCK_tls_ctx_options", 0, 0, "A lock to control all of the --ssl-* CTX related command line options for client server connection port"},
+   { &key_LOCK_admin_tls_ctx_options, "LOCK_admin_tls_ctx_options", 0, 0, "A lock to control all of the --ssl-* CTX related command line options for administrative connection port"},
+   { &key_LOCK_rotate_binlog_master_key, "LOCK_rotate_binlog_master_key", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME},
+-  { &key_monitor_info_run_lock, "Source_IO_monitor::run_lock", 0, 0, PSI_DOCUMENT_ME}
++  { &key_monitor_info_run_lock, "Source_IO_monitor::run_lock", 0, 0, PSI_DOCUMENT_ME},
++  { &key_sched_affinity_mutex, "Sched_affinity::m_mutex", 0, 0, PSI_DOCUMENT_ME}
+ };
+ /* clang-format on */
+ 
+diff --git a/sql/mysqld.h b/sql/mysqld.h
+index a7a80a22..9c721c34 100644
+--- a/sql/mysqld.h
++++ b/sql/mysqld.h
+@@ -442,6 +442,8 @@ extern PSI_mutex_key key_monitor_info_run_lock;
+ extern PSI_mutex_key key_commit_order_manager_mutex;
+ extern PSI_mutex_key key_mutex_slave_worker_hash;
+ 
++extern PSI_mutex_key key_sched_affinity_mutex;
++
+ extern PSI_rwlock_key key_rwlock_LOCK_logger;
+ extern PSI_rwlock_key key_rwlock_channel_map_lock;
+ extern PSI_rwlock_key key_rwlock_channel_lock;
+diff --git a/sql/sched_affinity_manager.cc b/sql/sched_affinity_manager.cc
+new file mode 100644
+index 00000000..c68f774c
+--- /dev/null
++++ b/sql/sched_affinity_manager.cc
+@@ -0,0 +1,616 @@
++/*****************************************************************************
++Copyright (c) 2022, Huawei Technologies Co., Ltd. All Rights Reserved.
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License, version 2.0, as published by the
++Free Software Foundation.
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
++for more details.
++*****************************************************************************/
++
++#include "sql/sched_affinity_manager.h"
++
++#include <cmath>
++
++#include <algorithm>
++
++#include "mysql/components/services/log_builtins.h"
++#include "mysqld_error.h"
++#include "sql/mysqld.h"
++
++namespace sched_affinity {
++const std::vector<Thread_type> thread_types = {
++    Thread_type::FOREGROUND,         Thread_type::LOG_WRITER,
++    Thread_type::LOG_FLUSHER,        Thread_type::LOG_WRITE_NOTIFIER,
++    Thread_type::LOG_FLUSH_NOTIFIER, Thread_type::LOG_CHECKPOINTER,
++    Thread_type::PURGE_COORDINATOR};
++
++const std::map<Thread_type, std::string> thread_type_names = {
++    {Thread_type::FOREGROUND, "foreground"},
++    {Thread_type::LOG_WRITER, "log_writer"},
++    {Thread_type::LOG_FLUSHER, "log_flusher"},
++    {Thread_type::LOG_WRITE_NOTIFIER, "log_write_notifier"},
++    {Thread_type::LOG_FLUSH_NOTIFIER, "log_flush_notifier"},
++    {Thread_type::LOG_CHECKPOINTER, "log_checkpointer"},
++    {Thread_type::PURGE_COORDINATOR, "purge_coordinator"},
++    {Thread_type::UNDEFINED, "undefined"}};
++}  // namespace sched_affinity
++
++
++#ifdef HAVE_LIBNUMA
++namespace sched_affinity {
++class Lock_guard {
++ public:
++  explicit Lock_guard(mysql_mutex_t &mutex) {
++    m_mutex = &mutex;
++    mysql_mutex_lock(m_mutex);
++  }
++  Lock_guard(const Lock_guard &) = delete;
++  Lock_guard &operator=(const Lock_guard &) = delete;
++  ~Lock_guard() { mysql_mutex_unlock(m_mutex); }
++
++ private:
++  mysql_mutex_t *m_mutex;
++};
++
++
++Sched_affinity_manager_numa::Sched_affinity_manager_numa()
++    : Sched_affinity_manager(),
++      m_total_cpu_num(0),
++      m_total_node_num(0),
++      m_cpu_num_per_node(0),
++      m_numa_aware(false),
++      m_root_pid(0),
++      m_is_fallback(false) {
++  mysql_mutex_init(key_sched_affinity_mutex, &m_mutex, nullptr);
++}
++
++Sched_affinity_manager_numa::~Sched_affinity_manager_numa() {
++  mysql_mutex_destroy(&m_mutex);
++}
++
++
++bool Sched_affinity_manager_numa::init(
++    const std::map<Thread_type, char *> &sched_affinity_parameter,
++    bool numa_aware) {
++  m_total_cpu_num = numa_num_configured_cpus();
++  m_total_node_num = numa_num_configured_nodes();
++  m_cpu_num_per_node = m_total_cpu_num / m_total_node_num;
++  m_numa_aware = numa_aware;
++  m_root_pid = gettid();
++
++  m_thread_bitmask.clear();
++  m_sched_affinity_groups.clear();
++  m_thread_pid.clear();
++  for (const auto &thread_type : thread_types) {
++    if (sched_affinity_parameter.find(thread_type) ==
++        sched_affinity_parameter.end()) {
++      continue;
++    }
++    m_thread_pid[thread_type] = std::set<pid_t>();
++    auto cpu_string = sched_affinity_parameter.at(thread_type);
++    if (!init_sched_affinity_info(
++            cpu_string == nullptr ? std::string("") : std::string(cpu_string),
++            m_thread_bitmask[thread_type])) {
++      return false;
++    }
++    if (is_thread_sched_enabled(thread_type) &&
++        !init_sched_affinity_group(
++            m_thread_bitmask[thread_type],
++            m_numa_aware && thread_type == Thread_type::FOREGROUND,
++            m_sched_affinity_groups[thread_type])) {
++      return false;
++    }
++  }
++
++  return true;
++}
++
++void Sched_affinity_manager_numa::fallback() {
++  if (!m_is_fallback) {
++    m_is_fallback = true;
++    m_fallback_delegate.reset(new Sched_affinity_manager_dummy());
++    LogErr(ERROR_LEVEL, ER_SET_FALLBACK_MODE);
++  }
++}
++
++bool Sched_affinity_manager_numa::init_sched_affinity_info(
++    const std::string &cpu_string, Bitmask_ptr &group_bitmask) {
++  group_bitmask.reset();
++  if (cpu_string.empty()) {
++    return true;
++  }
++  std::pair<std::string, bool> normalized_result =
++      normalize_cpu_string(cpu_string);
++  if (normalized_result.second == false) {
++    LogErr(ERROR_LEVEL, ER_CANT_PARSE_CPU_STRING, cpu_string.c_str());
++    return false;
++  }
++  group_bitmask.reset(numa_parse_cpustring(normalized_result.first.c_str()));
++  if (!group_bitmask) {
++    LogErr(ERROR_LEVEL, ER_CANT_PARSE_CPU_STRING, cpu_string.c_str());
++    return false;
++  }
++  return true;
++}
++
++bool Sched_affinity_manager_numa::init_sched_affinity_group(
++    const Bitmask_ptr &group_bitmask, const bool numa_aware,
++    std::vector<Sched_affinity_group> &sched_affinity_group) {
++  if (numa_aware) {
++    sched_affinity_group.resize(m_total_node_num);
++    for (auto node_id = 0; node_id < m_total_node_num; ++node_id) {
++      sched_affinity_group[node_id].avail_cpu_num = 0;
++      sched_affinity_group[node_id].avail_cpu_mask =
++          Bitmask_ptr(numa_allocate_cpumask());
++      sched_affinity_group[node_id].assigned_thread_num = 0;
++      for (auto cpu_id = m_cpu_num_per_node * node_id;
++           cpu_id < m_cpu_num_per_node * (node_id + 1); ++cpu_id) {
++        if (numa_bitmask_isbitset(group_bitmask.get(), cpu_id)) {
++          numa_bitmask_setbit(
++              sched_affinity_group[node_id].avail_cpu_mask.get(), cpu_id);
++          ++sched_affinity_group[node_id].avail_cpu_num;
++        }
++      }
++    }
++  } else {
++    sched_affinity_group.resize(1);
++    sched_affinity_group[0].avail_cpu_num = 0;
++    sched_affinity_group[0].avail_cpu_mask =
++        Bitmask_ptr(numa_allocate_cpumask());
++    copy_bitmask_to_bitmask(group_bitmask.get(),
++                            sched_affinity_group[0].avail_cpu_mask.get());
++    sched_affinity_group[0].assigned_thread_num = 0;
++    for (auto cpu_id = 0; cpu_id < m_total_cpu_num; ++cpu_id) {
++      if (numa_bitmask_isbitset(group_bitmask.get(), cpu_id)) {
++        ++sched_affinity_group[0].avail_cpu_num;
++      }
++    }
++  }
++  return true;
++}
++
++
++bool Sched_affinity_manager_numa::rebalance_group(
++    const char *cpu_string, const Thread_type thread_type) {
++  const Lock_guard lock(m_mutex);
++  if (m_is_fallback) {
++    LogErr(ERROR_LEVEL, ER_FALLBACK_DELEGATE_SCHED_AFFINITY_MANAGER_IS_CALLED);
++    return m_fallback_delegate->rebalance_group(cpu_string, thread_type);
++  }
++  const bool is_previous_sched_enabled = is_thread_sched_enabled(thread_type);
++  std::vector<std::set<pid_t>> group_thread;
++  if (!reset_sched_affinity_info(cpu_string, thread_type, group_thread)) {
++    fallback();
++    return false;
++  }
++  if (!is_thread_sched_enabled(thread_type) && !is_previous_sched_enabled) {
++    return true;
++  }
++  if (!is_thread_sched_enabled(thread_type) && is_previous_sched_enabled) {
++    Bitmask_ptr root_process_bitmask(numa_allocate_cpumask());
++    if (numa_sched_getaffinity(m_root_pid, root_process_bitmask.get()) < 0) {
++      fallback();
++      return false;
++    }
++    for (const auto tid : m_thread_pid[thread_type]) {
++      m_pid_group_id.erase(tid);
++      if (numa_sched_setaffinity(tid, root_process_bitmask.get()) < 0) {
++        fallback();
++        return false;
++      }
++    }
++    return true;
++  }
++  if (is_thread_sched_enabled(thread_type) && !is_previous_sched_enabled) {
++    for (const auto tid : m_thread_pid[thread_type]) {
++      if (!bind_to_group(tid)) {
++        fallback();
++        return false;
++      }
++    }
++    return true;
++  }
++  auto &sched_affinity_group = m_sched_affinity_groups[thread_type];
++  std::vector<int> migrate_thread_num;
++  migrate_thread_num.resize(sched_affinity_group.size());
++  count_migrate_thread_num(group_thread, sched_affinity_group,
++                           migrate_thread_num);
++  if (!migrate_thread_and_setaffinity(group_thread, sched_affinity_group,
++                                      migrate_thread_num)) {
++    fallback();
++    return false;
++  }
++  return true;
++}
++
++bool Sched_affinity_manager_numa::reset_sched_affinity_info(
++    const char *cpu_string, const Thread_type &thread_type,
++    std::vector<std::set<pid_t>> &group_thread) {
++  bool numa_aware = m_numa_aware && thread_type == Thread_type::FOREGROUND;
++  group_thread.resize(numa_aware ? m_total_node_num : 1, std::set<pid_t>());
++  for (const auto tid : m_thread_pid[thread_type]) {
++    const auto group_index = m_pid_group_id[tid];
++    group_thread[group_index].insert(tid);
++  }
++  if (!init_sched_affinity_info(
++          cpu_string == nullptr ? std::string("") : std::string(cpu_string),
++          m_thread_bitmask[thread_type])) {
++    return false;
++  }
++  if (is_thread_sched_enabled(thread_type) &&
++      !init_sched_affinity_group(m_thread_bitmask[thread_type], numa_aware,
++                                 m_sched_affinity_groups[thread_type])) {
++    return false;
++  }
++  return true;
++}
++
++void Sched_affinity_manager_numa::count_migrate_thread_num(
++    const std::vector<std::set<pid_t>> &group_thread,
++    std::vector<Sched_affinity_group> &sched_affinity_group,
++    std::vector<int> &migrate_thread_num) {
++  int total_thread_num = 0;
++  int total_avail_cpu_num = 0;
++  for (auto i = 0u; i < sched_affinity_group.size(); ++i) {
++    total_thread_num += group_thread[i].size();
++    total_avail_cpu_num += sched_affinity_group[i].avail_cpu_num;
++  }
++  if (total_avail_cpu_num == 0) {
++    for (auto i = 0u; i < sched_affinity_group.size(); ++i) {
++      sched_affinity_group[i].assigned_thread_num = 0;
++      migrate_thread_num[i] = 0;
++    }
++    return;
++  }
++  for (auto i = 0u; i < sched_affinity_group.size(); ++i) {
++    sched_affinity_group[i].assigned_thread_num =
++        std::ceil(static_cast<double>(total_thread_num *
++                                      sched_affinity_group[i].avail_cpu_num) /
++                  total_avail_cpu_num);
++    migrate_thread_num[i] =
++        sched_affinity_group[i].assigned_thread_num - group_thread[i].size();
++  }
++}
++
++bool Sched_affinity_manager_numa::migrate_thread_and_setaffinity(
++    const std::vector<std::set<pid_t>> &group_thread,
++    const std::vector<Sched_affinity_group> &sched_affinity_group,
++    std::vector<int> &migrate_thread_num) {
++  for (auto i = 0u; i < group_thread.size(); ++i) {
++    for (auto tid : group_thread[i]) {
++      if (sched_affinity_group[i].avail_cpu_num != 0 &&
++          numa_sched_setaffinity(
++              tid, sched_affinity_group[i].avail_cpu_mask.get()) < 0) {
++        return false;
++      }
++    }
++  }
++  for (auto i = 0u; i < group_thread.size(); ++i) {
++    if (migrate_thread_num[i] >= 0) {
++      continue;
++    }
++    std::set<pid_t>::iterator it = group_thread[i].begin();
++    for (auto j = 0u; j < group_thread.size(); ++j) {
++      while (migrate_thread_num[j] > 0 && migrate_thread_num[i] < 0 &&
++             it != group_thread[i].end()) {
++        m_pid_group_id[*it] = j;
++        if (numa_sched_setaffinity(
++                *it, sched_affinity_group[j].avail_cpu_mask.get()) < 0) {
++          return false;
++        }
++        --migrate_thread_num[j];
++        ++migrate_thread_num[i];
++        ++it;
++      }
++    }
++  }
++  return true;
++}
++
++bool Sched_affinity_manager_numa::is_thread_sched_enabled(
++    const Thread_type thread_type) {
++  auto it = m_thread_bitmask.find(thread_type);
++  return (it != m_thread_bitmask.end() && it->second) ? true : false;
++}
++
++bool Sched_affinity_manager_numa::register_thread(const Thread_type thread_type,
++                                                  const pid_t pid) {
++  const Lock_guard lock(m_mutex);
++
++  if (m_is_fallback) {
++    LogErr(ERROR_LEVEL, ER_FALLBACK_DELEGATE_SCHED_AFFINITY_MANAGER_IS_CALLED);
++    return m_fallback_delegate->register_thread(thread_type, pid);
++  }
++
++  m_thread_pid[thread_type].insert(pid);
++  if (!bind_to_group(pid)) {
++    LogErr(ERROR_LEVEL, ER_CANNOT_SET_THREAD_SCHED_AFFINIFY,
++           thread_type_names.at(thread_type).c_str());
++    fallback();
++    return false;
++  }
++  return true;
++}
++
++bool Sched_affinity_manager_numa::unregister_thread(const pid_t pid) {
++  const Lock_guard lock(m_mutex);
++
++  if (m_is_fallback) {
++    LogErr(ERROR_LEVEL, ER_FALLBACK_DELEGATE_SCHED_AFFINITY_MANAGER_IS_CALLED);
++    return m_fallback_delegate->unregister_thread(pid);
++  }
++
++  auto thread_type = get_thread_type_by_pid(pid);
++  if (thread_type == Thread_type::UNDEFINED) {
++    return false;
++  }
++
++  if (!unbind_from_group(pid)) {
++    LogErr(ERROR_LEVEL, ER_CANNOT_UNSET_THREAD_SCHED_AFFINIFY,
++           thread_type_names.at(thread_type).c_str());
++    fallback();
++    return false;
++  }
++  m_thread_pid[thread_type].erase(pid);
++  return true;
++}
++
++Thread_type Sched_affinity_manager_numa::get_thread_type_by_pid(
++    const pid_t pid) {
++  for (const auto &thread_pid : m_thread_pid) {
++    if (thread_pid.second.find(pid) != thread_pid.second.end()) {
++      return thread_pid.first;
++    }
++  }
++  return Thread_type::UNDEFINED;
++}
++
++bool Sched_affinity_manager_numa::bind_to_group(const pid_t pid) {
++  auto thread_type = get_thread_type_by_pid(pid);
++  if (thread_type == Thread_type::UNDEFINED) {
++    return false;
++  }
++  if (!is_thread_sched_enabled(thread_type)) {
++    return true;
++  }
++  auto &sched_affinity_group = m_sched_affinity_groups[thread_type];
++  const int INVALID_INDEX = -1;
++  auto best_index = INVALID_INDEX;
++  for (auto i = 0u; i < sched_affinity_group.size(); ++i) {
++    if (sched_affinity_group[i].avail_cpu_num == 0) {
++      continue;
++    }
++    if (best_index == INVALID_INDEX ||
++        sched_affinity_group[i].assigned_thread_num *
++                sched_affinity_group[best_index].avail_cpu_num <
++            sched_affinity_group[best_index].assigned_thread_num *
++                sched_affinity_group[i].avail_cpu_num) {
++      best_index = i;
++    }
++  }
++
++  if (best_index == INVALID_INDEX) {
++    return false;
++  }
++  auto ret = numa_sched_setaffinity(
++      pid, sched_affinity_group[best_index].avail_cpu_mask.get());
++  if (ret == 0) {
++    ++sched_affinity_group[best_index].assigned_thread_num;
++    m_pid_group_id[pid] = best_index;
++    return true;
++  }
++  return false;
++}
++
++
++bool Sched_affinity_manager_numa::unbind_from_group(const pid_t pid) {
++  auto thread_type = get_thread_type_by_pid(pid);
++  if (thread_type == Thread_type::UNDEFINED) {
++    return false;
++  }
++  if (!is_thread_sched_enabled(thread_type)) {
++    return true;
++  }
++  auto &sched_affinity_group = m_sched_affinity_groups[thread_type];
++  auto index = m_pid_group_id.find(pid);
++  if (index == m_pid_group_id.end() ||
++      index->second >= static_cast<int>(sched_affinity_group.size())) {
++    return false;
++  }
++  --sched_affinity_group[index->second].assigned_thread_num;
++  m_pid_group_id.erase(index);
++
++  return copy_affinity(pid, m_root_pid);
++}
++
++bool Sched_affinity_manager_numa::copy_affinity(pid_t from, pid_t to) {
++  Bitmask_ptr to_bitmask(numa_allocate_cpumask());
++  if (numa_sched_getaffinity(to, to_bitmask.get()) < 0) {
++    return false;
++  }
++  if (numa_sched_setaffinity(from, to_bitmask.get()) < 0) {
++    return false;
++  }
++  return true;
++}
++
++std::string Sched_affinity_manager_numa::take_group_snapshot() {
++  const Lock_guard lock(m_mutex);
++
++  if (m_is_fallback) {
++    LogErr(ERROR_LEVEL, ER_FALLBACK_DELEGATE_SCHED_AFFINITY_MANAGER_IS_CALLED);
++    return m_fallback_delegate->take_group_snapshot();
++  }
++
++  std::string group_snapshot = "";
++  for (const auto &thread_type : thread_types) {
++    if (!is_thread_sched_enabled(thread_type)) {
++      continue;
++    }
++    group_snapshot += thread_type_names.at(thread_type) + ": ";
++    for (const auto &sched_affinity_group :
++         m_sched_affinity_groups[thread_type]) {
++      group_snapshot +=
++          (std::to_string(sched_affinity_group.assigned_thread_num) +
++           std::string("/") +
++           std::to_string(sched_affinity_group.avail_cpu_num) +
++           std::string("; "));
++    }
++  }
++  return group_snapshot;
++}
++
++int Sched_affinity_manager_numa::get_total_node_number() {
++  return m_total_node_num;
++}
++
++int Sched_affinity_manager_numa::get_cpu_number_per_node() {
++  return m_cpu_num_per_node;
++}
++
++bool Sched_affinity_manager_numa::check_cpu_string(
++    const std::string &cpu_string) {
++  auto ret = normalize_cpu_string(cpu_string);
++  if (!ret.second) {
++    return false;
++  }
++  Bitmask_ptr bitmask(numa_parse_cpustring(ret.first.c_str()));
++  return bitmask.get() != nullptr;
++}
++
++std::pair<std::string, bool> Sched_affinity_manager_numa::normalize_cpu_string(
++    const std::string &cpu_string) {
++  std::string normalized_cpu_string = "";
++  bool invalid_cpu_string = false;
++  const int INVALID_CORE_ID = -1;
++  int core_id = INVALID_CORE_ID;
++  for (auto c : cpu_string) {
++    switch (c) {
++      case ' ':
++        break;
++      case '-':
++      case ',':
++        if (core_id == INVALID_CORE_ID) {
++          invalid_cpu_string = true;
++        } else {
++          normalized_cpu_string += std::to_string(core_id);
++          normalized_cpu_string += c;
++          core_id = INVALID_CORE_ID;
++        }
++        break;
++      case '0' ... '9':
++        if (core_id == INVALID_CORE_ID) {
++          core_id = (c - '0');
++        } else {
++          core_id = core_id * 10 + (c - '0');
++        }
++        break;
++      default:
++        invalid_cpu_string = true;
++        break;
++    }
++    if (invalid_cpu_string) {
++      break;
++    }
++  }
++  if (core_id != INVALID_CORE_ID) {
++    normalized_cpu_string += std::to_string(core_id);
++  }
++  if (!normalized_cpu_string.empty() &&
++      (*normalized_cpu_string.rbegin() == '-' ||
++       *normalized_cpu_string.rbegin() == ',')) {
++    invalid_cpu_string = true;
++  }
++  if (invalid_cpu_string) {
++    return std::make_pair(std::string(), false);
++  }
++  return std::make_pair(normalized_cpu_string, true);
++}
++
++bool Sched_affinity_manager_numa::update_numa_aware(bool numa_aware) {
++  const Lock_guard lock(m_mutex);
++  if (m_is_fallback) {
++    LogErr(ERROR_LEVEL, ER_FALLBACK_DELEGATE_SCHED_AFFINITY_MANAGER_IS_CALLED);
++    return m_fallback_delegate->update_numa_aware(numa_aware);
++  }
++  if (m_numa_aware == numa_aware) {
++    return true;
++  }
++  std::vector<pid_t> pending_pids;
++  pending_pids.resize(m_pid_group_id.size());
++  std::transform(m_pid_group_id.begin(), m_pid_group_id.end(),
++                 pending_pids.begin(),
++                 [](auto &pid_group_id) { return pid_group_id.first; });
++  for (const auto &pending_pid : pending_pids) {
++    if (!unbind_from_group(pending_pid)) {
++      LogErr(ERROR_LEVEL, ER_CANNOT_UNSET_THREAD_SCHED_AFFINIFY,
++             thread_type_names.at(get_thread_type_by_pid(pending_pid)).c_str());
++      fallback();
++      return false;
++    }
++  }
++  m_numa_aware = numa_aware;
++  for (const auto &thread_type : thread_types) {
++    if (is_thread_sched_enabled(thread_type) &&
++        !init_sched_affinity_group(
++            m_thread_bitmask[thread_type],
++            m_numa_aware && thread_type == Thread_type::FOREGROUND,
++            m_sched_affinity_groups[thread_type])) {
++      fallback();
++      return false;
++    }
++  }
++  for (const auto &pending_pid : pending_pids) {
++    if (!bind_to_group(pending_pid)) {
++      LogErr(ERROR_LEVEL, ER_CANNOT_SET_THREAD_SCHED_AFFINIFY,
++             thread_type_names.at(get_thread_type_by_pid(pending_pid)).c_str());
++      fallback();
++      return false;
++    }
++  }
++  return true;
++}
++}  // namespace sched_affinity
++#endif /* HAVE_LIBNUMA */
++
++namespace sched_affinity {
++static Sched_affinity_manager *sched_affinity_manager = nullptr;
++Sched_affinity_manager *Sched_affinity_manager::create_instance(
++    const std::map<Thread_type, char *> &sched_affinity_parameter,
++    bool numa_aware) {
++  Sched_affinity_manager::free_instance();
++#ifdef HAVE_LIBNUMA
++  if (numa_available() == -1) {
++    LogErr(WARNING_LEVEL, ER_NUMA_AVAILABLE_TEST_FAIL);
++    LogErr(INFORMATION_LEVEL, ER_USE_DUMMY_SCHED_AFFINITY_MANAGER);
++    sched_affinity_manager = new Sched_affinity_manager_dummy();
++  } else {
++    sched_affinity_manager = new Sched_affinity_manager_numa();
++  }
++#else
++  LogErr(WARNING_LEVEL, ER_LIBNUMA_TEST_FAIL);
++  LogErr(INFORMATION_LEVEL, ER_USE_DUMMY_SCHED_AFFINITY_MANAGER);
++  sched_affinity_manager = new Sched_affinity_manager_dummy();
++#endif /* HAVE_LIBNUMA */
++  if (!sched_affinity_manager->init(sched_affinity_parameter, numa_aware)) {
++    return nullptr;
++  }
++  return sched_affinity_manager;
++}
++
++Sched_affinity_manager *Sched_affinity_manager::get_instance() {
++  return sched_affinity_manager;
++}
++
++void Sched_affinity_manager::free_instance() {
++  if (sched_affinity_manager != nullptr) {
++    delete sched_affinity_manager;
++    sched_affinity_manager = nullptr;
++  }
++}
++
++pid_t gettid() { return static_cast<pid_t>(syscall(SYS_gettid)); }
++}  // namespace sched_affinity
++
+diff --git a/sql/sched_affinity_manager.h b/sql/sched_affinity_manager.h
+new file mode 100644
+index 00000000..9c12cd3b
+--- /dev/null
++++ b/sql/sched_affinity_manager.h
+@@ -0,0 +1,217 @@
++/*****************************************************************************
++Copyright (c) 2022, Huawei Technologies Co., Ltd. All Rights Reserved.
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License, version 2.0, as published by the
++Free Software Foundation.
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License, version 2.0,
++for more details.
++*****************************************************************************/
++
++#ifndef SCHED_AFFINITY_MANAGER_H
++#define SCHED_AFFINITY_MANAGER_H
++#include "my_config.h"
++#ifdef HAVE_LIBNUMA
++#include <numa.h>
++#endif
++
++#include <map>
++#include <memory>
++#include <set>
++#include <string>
++#include <utility>
++#include <vector>
++
++#include <sys/types.h>
++
++#ifdef GMOCK_FOUND
++#include "gtest/gtest_prod.h"
++#endif
++
++#include "mysql/psi/mysql_mutex.h"
++
++namespace sched_affinity {
++enum class Thread_type {
++  FOREGROUND,
++  LOG_WRITER,
++  LOG_FLUSHER,
++  LOG_WRITE_NOTIFIER,
++  LOG_FLUSH_NOTIFIER,
++  LOG_CHECKPOINTER,
++  PURGE_COORDINATOR,
++  UNDEFINED
++};
++
++extern const std::vector<Thread_type> thread_types;
++extern const std::map<Thread_type, std::string> thread_type_names;
++
++pid_t gettid();
++
++class Sched_affinity_manager {
++ public:
++  virtual ~Sched_affinity_manager(){};
++  static Sched_affinity_manager *create_instance(
++      const std::map<Thread_type, char *> &, bool numa_aware);
++  static Sched_affinity_manager *get_instance();
++  static void free_instance();
++  virtual bool register_thread(const Thread_type thread_type,
++                               const pid_t pid) = 0;
++  virtual bool unregister_thread(const pid_t pid) = 0;
++  virtual bool rebalance_group(const char *cpu_string,
++                               const Thread_type thread_type) = 0;
++  virtual bool update_numa_aware(bool numa_aware) = 0;
++  virtual std::string take_group_snapshot() = 0;
++  virtual int get_total_node_number() = 0;
++  virtual int get_cpu_number_per_node() = 0;
++  virtual bool check_cpu_string(const std::string &cpu_string) = 0;
++
++ protected:
++  virtual bool init(const std::map<Thread_type, char *> &,
++                    bool numa_aware) = 0;
++};
++
++class Sched_affinity_manager_dummy : public Sched_affinity_manager {
++ public:
++  Sched_affinity_manager_dummy(const Sched_affinity_manager_dummy &) = delete;
++  Sched_affinity_manager_dummy &operator=(
++      const Sched_affinity_manager_dummy &) = delete;
++  Sched_affinity_manager_dummy(Sched_affinity_manager_dummy &&) = delete;
++  Sched_affinity_manager_dummy &operator=(Sched_affinity_manager_dummy &&) =
++      delete;
++  bool register_thread(const Thread_type, const pid_t) override { return true; }
++  bool unregister_thread(const pid_t) override { return true; }
++  bool rebalance_group(const char *, const Thread_type) override {
++    return true;
++  }
++  bool update_numa_aware(bool) override { return true; }
++  std::string take_group_snapshot() override { return std::string(); }
++  int get_total_node_number() override { return -1; }
++  int get_cpu_number_per_node() override { return -1; }
++  bool check_cpu_string(const std::string &) override { return true; }
++
++ private:
++  Sched_affinity_manager_dummy() : Sched_affinity_manager(){};
++  ~Sched_affinity_manager_dummy() override{};
++  bool init(const std::map<Thread_type, char *> &, bool) override {
++    return true;
++  }
++  friend class Sched_affinity_manager;
++  friend class Sched_affinity_manager_numa;
++
++#ifdef FRIEND_TEST
++  FRIEND_TEST(SchedAffinityManagerDummyTest, Implementation);
++#endif
++};
++
++#ifdef HAVE_LIBNUMA
++
++struct Bitmask_deleter {
++  void operator()(bitmask *ptr) {
++    if (ptr != nullptr) {
++      numa_free_cpumask(ptr);
++    }
++  }
++};
++
++using Bitmask_ptr = std::unique_ptr<bitmask, Bitmask_deleter>;
++
++struct Sched_affinity_group {
++  Bitmask_ptr avail_cpu_mask;
++  int avail_cpu_num;
++  int assigned_thread_num;
++};
++
++class Sched_affinity_manager_numa : public Sched_affinity_manager {
++ public:
++  Sched_affinity_manager_numa(const Sched_affinity_manager_numa &) = delete;
++  Sched_affinity_manager_numa &operator=(const Sched_affinity_manager_numa &) =
++      delete;
++  Sched_affinity_manager_numa(Sched_affinity_manager_numa &&) = delete;
++  Sched_affinity_manager_numa &operator=(Sched_affinity_manager_numa &&) =
++      delete;
++
++  bool register_thread(const Thread_type thread_type, const pid_t pid) override;
++  bool unregister_thread(const pid_t pid) override;
++  bool rebalance_group(const char *cpu_string,
++                       const Thread_type thread_type) override;
++  bool update_numa_aware(bool numa_aware) override;
++  std::string take_group_snapshot() override;
++  int get_total_node_number() override;
++  int get_cpu_number_per_node() override;
++  bool check_cpu_string(const std::string &cpu_string) override;
++
++ private:
++  Sched_affinity_manager_numa();
++  ~Sched_affinity_manager_numa() override;
++  bool init(const std::map<Thread_type, char *> &, bool) override;
++  bool init_sched_affinity_info(const std::string &cpu_string,
++                                Bitmask_ptr &group_bitmask);
++  bool init_sched_affinity_group(
++      const Bitmask_ptr &group_bitmask, const bool numa_aware,
++      std::vector<Sched_affinity_group> &sched_affinity_group);
++  bool is_thread_sched_enabled(const Thread_type thread_type);
++  bool bind_to_group(const pid_t pid);
++  bool unbind_from_group(const pid_t pid);
++
++  bool copy_affinity(pid_t from, pid_t to);
++  bool reset_sched_affinity_info(const char *cpu_string, const Thread_type &,
++                                 std::vector<std::set<pid_t>> &);
++  void count_migrate_thread_num(const std::vector<std::set<pid_t>> &,
++                                std::vector<Sched_affinity_group> &,
++                                std::vector<int> &);
++  bool migrate_thread_and_setaffinity(const std::vector<std::set<pid_t>> &,
++                                      const std::vector<Sched_affinity_group> &,
++                                      std::vector<int> &);
++  Thread_type get_thread_type_by_pid(const pid_t pid);
++  static std::pair<std::string, bool> normalize_cpu_string(
++      const std::string &cpu_string);
++  /**
++    The Sched_affinity_manager_numa instance's internal state may become
++    inconsistent due to a previous failure, e.g. a libnuma call returning an
++    error. Call fallback() to use a fallback delegate to serve further
++    requests to the Sched_affinity_manager_numa instance's public interface.
++    This method should be called under the protection of m_mutex.
++  */
++  void fallback();
++
++ private:
++  int m_total_cpu_num;
++  int m_total_node_num;
++  int m_cpu_num_per_node;
++  bool m_numa_aware;
++  pid_t m_root_pid;
++  bool m_is_fallback;
++  std::unique_ptr<Sched_affinity_manager_dummy> m_fallback_delegate;
++  std::map<Thread_type, std::vector<Sched_affinity_group>>
++      m_sched_affinity_groups;
++  std::map<Thread_type, Bitmask_ptr> m_thread_bitmask;
++  std::map<Thread_type, std::set<pid_t>> m_thread_pid;
++  std::map<pid_t, int> m_pid_group_id;
++  mysql_mutex_t m_mutex;
++
++  friend class Sched_affinity_manager;
++
++#ifdef FRIEND_TEST
++  FRIEND_TEST(SchedAffinityManagerTest, InitSchedAffinityInfo);
++  FRIEND_TEST(SchedAffinityManagerTest, InitSchedAffinityGroup);
++  FRIEND_TEST(SchedAffinityManagerTest, NormalizeCpuString);
++  FRIEND_TEST(SchedAffinityManagerTest, BindToGroup);
++  FRIEND_TEST(SchedAffinityManagerTest, UnbindFromGroup);
++  FRIEND_TEST(SchedAffinityManagerTest, GetThreadTypeByPid);
++  FRIEND_TEST(SchedAffinityManagerTest, RegisterThread);
++  FRIEND_TEST(SchedAffinityManagerTest, UnregisterThread);
++  FRIEND_TEST(SchedAffinityManagerTest, NumaAwareDisabled);
++  FRIEND_TEST(SchedAffinityManagerTest, NumaAwareEnabled);
++  FRIEND_TEST(SchedAffinityManagerTest, RebalanceGroup);
++  FRIEND_TEST(SchedAffinityManagerTest, IsThreadSchedEnabled);
++  FRIEND_TEST(SchedAffinityManagerTest, UpdateNumaAware);
++  FRIEND_TEST(SchedAffinityManagerTest, AllNullptrConfig);
++  FRIEND_TEST(SchedAffinityManagerTest, EmptyStringConfig);
++  FRIEND_TEST(SchedAffinityManagerTest, EmptyContainerConfig);
++  FRIEND_TEST(SchedAffinityManagerTest, Fallback);
++#endif
++};
++#endif /* HAVE_LIBNUMA */
++}  // namespace sched_affinity
++#endif /* SCHED_AFFINITY_MANAGER_H */
+diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
+index 3b8473bd..5f55b972 100644
+--- a/sql/sys_vars.cc
++++ b/sql/sys_vars.cc
+@@ -119,6 +119,7 @@
+ #include "sql/rpl_rli.h"    // Relay_log_info
+ #include "sql/rpl_slave.h"  // SLAVE_THD_TYPE
+ #include "sql/rpl_write_set_handler.h"  // transaction_write_set_hashing_algorithms
++#include "sql/sched_affinity_manager.h"
+ #include "sql/server_component/log_builtins_filter_imp.h"  // until we have pluggable variables
+ #include "sql/server_component/log_builtins_imp.h"
+ #include "sql/session_tracker.h"
+@@ -1382,6 +1383,174 @@ static bool check_binlog_trx_compression(sys_var *self MY_ATTRIBUTE((unused)),
+   return false;
+ }
+ 
++bool sched_affinity_numa_aware = false;
++
++static bool on_sched_affinity_numa_aware_update(sys_var *, THD *, enum_var_type)
++{
++  if (sched_affinity::Sched_affinity_manager::get_instance() != nullptr &&
++      !sched_affinity::Sched_affinity_manager::get_instance()
++           ->update_numa_aware(sched_affinity_numa_aware)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_NUMA_AWARE, MYF(0));
++    return true;
++  }
++  return false;
++}
++
++Sys_var_bool Sys_sched_affinity_numa_aware(
++    "sched_affinity_numa_aware",
++    "Schedule threads with numa information",
++    GLOBAL_VAR(sched_affinity_numa_aware), CMD_LINE(OPT_ARG),
++    DEFAULT(false), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(nullptr), ON_UPDATE(on_sched_affinity_numa_aware_update));
++
++std::map<sched_affinity::Thread_type, char *> sched_affinity_parameter = {
++    {sched_affinity::Thread_type::FOREGROUND, nullptr},
++    {sched_affinity::Thread_type::LOG_WRITER, nullptr},
++    {sched_affinity::Thread_type::LOG_FLUSHER, nullptr},
++    {sched_affinity::Thread_type::LOG_WRITE_NOTIFIER, nullptr},
++    {sched_affinity::Thread_type::LOG_FLUSH_NOTIFIER, nullptr},
++    {sched_affinity::Thread_type::LOG_CHECKPOINTER, nullptr},
++    {sched_affinity::Thread_type::PURGE_COORDINATOR, nullptr}};
++
++static bool check_sched_affinity_parameter(sys_var *, THD *, set_var *var) {
++  char *c = var->save_result.string_value.str;
++  if (sched_affinity::Sched_affinity_manager::get_instance() != nullptr &&
++      c != nullptr &&
++      !sched_affinity::Sched_affinity_manager::get_instance()->check_cpu_string(
++          std::string(c))) {
++    my_error(ER_INVALID_CPU_STRING, MYF(0), c);
++    return true;
++  }
++  return false;
++}
++
++static bool on_sched_affinity_foreground_thread_update(sys_var *, THD *,
++                                                       enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::FOREGROUND],
++          sched_affinity::Thread_type::FOREGROUND)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::FOREGROUND).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_foreground_thread(
++    "sched_affinity_foreground_thread",
++    "The set of cpus which foreground threads will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::FOREGROUND]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_foreground_thread_update));
++
++static bool on_sched_affinity_log_writer_update(sys_var *, THD *,
++                                                enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::LOG_WRITER],
++          sched_affinity::Thread_type::LOG_WRITER)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::LOG_WRITER).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_log_writer(
++    "sched_affinity_log_writer",
++    "The set of cpus which log writer thread will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::LOG_WRITER]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_log_writer_update));
++
++static bool on_sched_affinity_log_flusher_update(sys_var *, THD *, enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::LOG_FLUSHER],
++          sched_affinity::Thread_type::LOG_FLUSHER)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::LOG_FLUSHER).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_log_flusher(
++    "sched_affinity_log_flusher",
++    "The set of cpus which log flusher thread will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::LOG_FLUSHER]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_log_flusher_update));
++
++static bool on_sched_affinity_log_write_notifier_update(sys_var *, THD *, enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::LOG_WRITE_NOTIFIER],
++          sched_affinity::Thread_type::LOG_WRITE_NOTIFIER)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::LOG_WRITE_NOTIFIER).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_log_write_notifier(
++    "sched_affinity_log_write_notifier",
++    "The set of cpus which log write notifier thread will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::LOG_WRITE_NOTIFIER]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_log_write_notifier_update));
++
++static bool on_sched_affinity_log_flush_notifier_update(sys_var *, THD *, enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::LOG_FLUSH_NOTIFIER],
++          sched_affinity::Thread_type::LOG_FLUSH_NOTIFIER)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::LOG_FLUSH_NOTIFIER).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_log_flush_notifier(
++    "sched_affinity_log_flush_notifier",
++    "The set of cpus which log flush notifier thread will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::LOG_FLUSH_NOTIFIER]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_log_flush_notifier_update));
++
++static bool on_sched_affinity_log_checkpointer_update(sys_var *, THD *, enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::LOG_CHECKPOINTER],
++          sched_affinity::Thread_type::LOG_CHECKPOINTER)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::LOG_CHECKPOINTER).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_log_checkpointer(
++    "sched_affinity_log_checkpointer",
++    "The set of cpus which log checkpointer thread will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::LOG_CHECKPOINTER]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_log_checkpointer_update));
++
++static bool on_sched_affinity_purge_coordinator_update(sys_var *, THD *, enum_var_type) {
++  if (!sched_affinity::Sched_affinity_manager::get_instance()->rebalance_group(
++          sched_affinity_parameter[sched_affinity::Thread_type::PURGE_COORDINATOR],
++          sched_affinity::Thread_type::PURGE_COORDINATOR)) {
++    my_error(ER_CANNOT_UPDATE_SCHED_AFFINITY_PARAMETER, MYF(0),
++             sched_affinity::thread_type_names.at(sched_affinity::Thread_type::PURGE_COORDINATOR).c_str());
++    return true;
++  }
++  return false;
++}
++
++static Sys_var_charptr Sys_sched_affinity_purge_coordinator(
++    "sched_affinity_purge_coordinator",
++    "The set of cpus which purge coordinator thread will run on.",
++    GLOBAL_VAR(sched_affinity_parameter[sched_affinity::Thread_type::PURGE_COORDINATOR]), CMD_LINE(REQUIRED_ARG),
++    IN_FS_CHARSET, DEFAULT(nullptr), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_sched_affinity_parameter),
++    ON_UPDATE(on_sched_affinity_purge_coordinator_update));
++
+ static Sys_var_bool Sys_binlog_trx_compression(
+     "binlog_transaction_compression",
+     "Whether to compress transactions or not. Transactions are compressed "
Transactions are compressed " +diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt +index fe8b6a78..77ec80ab 100644 +--- a/storage/innobase/CMakeLists.txt ++++ b/storage/innobase/CMakeLists.txt +@@ -33,8 +33,11 @@ ADD_DEFINITIONS(-DPFS_DIRECT_CALL) + INCLUDE_DIRECTORIES( + ${CMAKE_SOURCE_DIR}/sql + ${CMAKE_SOURCE_DIR}/sql/auth ++ ${GMOCK_INCLUDE_DIRS} + ) + ++INCLUDE_DIRECTORIES(${GMOCK_INCLUDE_DIRS}) ++ + # Conflicting YYSTYPE, because we have multiple Bison grammars. + # WL#11100 Migrate to Bison 3.x should fix this. + # +diff --git a/storage/innobase/log/log0chkp.cc b/storage/innobase/log/log0chkp.cc +index b4b8f12f..082f7980 100644 +--- a/storage/innobase/log/log0chkp.cc ++++ b/storage/innobase/log/log0chkp.cc +@@ -50,6 +50,8 @@ the file COPYING.Google. + #include + #endif /* !UNIV_HOTBACKUP */ + ++#include ++ + #include "arch0arch.h" + #include "buf0buf.h" + #include "buf0flu.h" +@@ -59,6 +61,7 @@ the file COPYING.Google. + #include "log0log.h" + #include "log0recv.h" + #include "mem0mem.h" ++#include "sql/sched_affinity_manager.h" + #include "srv0mon.h" + #include "srv0srv.h" + #include "srv0start.h" +@@ -1026,6 +1029,17 @@ static void log_consider_checkpoint(log_t &log) { + } + + void log_checkpointer(log_t *log_ptr) { ++ auto sched_affinity_manager = sched_affinity::Sched_affinity_manager::get_instance(); ++ bool is_registered_to_sched_affinity = false; ++ auto pid = sched_affinity::gettid(); ++ if (sched_affinity_manager != nullptr && ++ !(is_registered_to_sched_affinity = ++ sched_affinity_manager->register_thread( ++ sched_affinity::Thread_type::LOG_CHECKPOINTER, pid))) { ++ ib::error(ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER) ++ << "log_checkpointer"; ++ } ++ + ut_a(log_ptr != nullptr); + + log_t &log = *log_ptr; +@@ -1117,6 +1131,11 @@ void log_checkpointer(log_t *log_ptr) { + /* We prefer to wait until all writing is done. */ + } + } ++ if (is_registered_to_sched_affinity && ++ !sched_affinity_manager->unregister_thread(pid)) { ++ ib::error(ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER) ++ << "log_checkpointer"; ++ } + } + + /** @} */ +diff --git a/storage/innobase/log/log0write.cc b/storage/innobase/log/log0write.cc +index 37e5b0f2..6b4572b3 100644 +--- a/storage/innobase/log/log0write.cc ++++ b/storage/innobase/log/log0write.cc +@@ -49,6 +49,8 @@ the file COPYING.Google. + + #include + ++#include ++ + #include "arch0arch.h" + #include "buf0buf.h" + #include "buf0flu.h" +@@ -62,6 +64,7 @@ the file COPYING.Google. 
+ #include "log0recv.h" + #include "mem0mem.h" + #include "mysqld.h" /* server_uuid */ ++#include "sql/sched_affinity_manager.h" + #include "srv0mon.h" + #include "srv0srv.h" + #include "srv0start.h" +@@ -2142,6 +2145,17 @@ static void log_writer_write_buffer(log_t &log, lsn_t next_write_lsn) { + } + + void log_writer(log_t *log_ptr) { ++ auto sched_affinity_manager = sched_affinity::Sched_affinity_manager::get_instance(); ++ auto pid = sched_affinity::gettid(); ++ bool is_registered_to_sched_affinity = false; ++ if (sched_affinity_manager != nullptr && ++ !(is_registered_to_sched_affinity = ++ sched_affinity_manager->register_thread( ++ sched_affinity::Thread_type::LOG_WRITER, pid))) { ++ ib::error(ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER) ++ << "log_writer"; ++ } ++ + ut_a(log_ptr != nullptr); + + log_t &log = *log_ptr; +@@ -2239,6 +2253,13 @@ void log_writer(log_t *log_ptr) { + } + + log_writer_mutex_exit(log); ++ ++ if (is_registered_to_sched_affinity && ++ !sched_affinity_manager->unregister_thread(pid)) { ++ ib::error(ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER) ++ << "log_writer"; ++ } ++ + } + + /** @} */ +@@ -2412,6 +2433,17 @@ static void log_flush_low(log_t &log) { + } + + void log_flusher(log_t *log_ptr) { ++ auto sched_affinity_manager = sched_affinity::Sched_affinity_manager::get_instance(); ++ bool is_registered_to_sched_affinity = false; ++ auto pid = sched_affinity::gettid(); ++ if (sched_affinity_manager != nullptr && ++ !(is_registered_to_sched_affinity = ++ sched_affinity_manager->register_thread( ++ sched_affinity::Thread_type::LOG_FLUSHER, pid))) { ++ ib::error(ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER) ++ << "log_flusher"; ++ } ++ + ut_a(log_ptr != nullptr); + + log_t &log = *log_ptr; +@@ -2541,6 +2573,13 @@ void log_flusher(log_t *log_ptr) { + ut_a(log.write_lsn.load() == log.flushed_to_disk_lsn.load()); + + log_flusher_mutex_exit(log); ++ ++ if (is_registered_to_sched_affinity && ++ !sched_affinity_manager->unregister_thread(pid)) { ++ ib::error(ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER) ++ << "log_flusher"; ++ } ++ + } + + /** @} */ +@@ -2554,6 +2593,17 @@ void log_flusher(log_t *log_ptr) { + /** @{ */ + + void log_write_notifier(log_t *log_ptr) { ++ auto sched_affinity_manager = sched_affinity::Sched_affinity_manager::get_instance(); ++ bool is_registered_to_sched_affinity = false; ++ auto pid = sched_affinity::gettid(); ++ if (sched_affinity_manager != nullptr && ++ !(is_registered_to_sched_affinity = ++ sched_affinity_manager->register_thread( ++ sched_affinity::Thread_type::LOG_WRITE_NOTIFIER, pid))) { ++ ib::error(ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER) ++ << "log_write_notifier"; ++ } ++ + ut_a(log_ptr != nullptr); + + log_t &log = *log_ptr; +@@ -2659,6 +2709,13 @@ void log_write_notifier(log_t *log_ptr) { + } + + log_write_notifier_mutex_exit(log); ++ ++ if (is_registered_to_sched_affinity && ++ !sched_affinity_manager->unregister_thread(pid)) { ++ ib::error(ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER) ++ << "log_write_notifier"; ++ } ++ + } + + /** @} */ +@@ -2672,6 +2729,17 @@ void log_write_notifier(log_t *log_ptr) { + /** @{ */ + + void log_flush_notifier(log_t *log_ptr) { ++ auto sched_affinity_manager = sched_affinity::Sched_affinity_manager::get_instance(); ++ bool is_registered_to_sched_affinity = false; ++ auto pid = sched_affinity::gettid(); ++ if (sched_affinity_manager != nullptr && ++ !(is_registered_to_sched_affinity = ++ sched_affinity_manager->register_thread( ++ 
sched_affinity::Thread_type::LOG_FLUSH_NOTIFIER, pid))) { ++ ib::error(ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER) ++ << "log_flush_notifier"; ++ } ++ + ut_a(log_ptr != nullptr); + + log_t &log = *log_ptr; +@@ -2777,6 +2845,13 @@ void log_flush_notifier(log_t *log_ptr) { + } + + log_flush_notifier_mutex_exit(log); ++ ++ if (is_registered_to_sched_affinity && ++ !sched_affinity_manager->unregister_thread(pid)) { ++ ib::error(ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER) ++ << "log_flush_notifier"; ++ } ++ + } + + /** @} */ +diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc +index 40cea6d8..2f5071cd 100644 +--- a/storage/innobase/srv/srv0srv.cc ++++ b/storage/innobase/srv/srv0srv.cc +@@ -49,6 +49,7 @@ this program; if not, write to the Free Software Foundation, Inc., + #include + #include + #include ++#include + + #include + +@@ -72,6 +73,7 @@ this program; if not, write to the Free Software Foundation, Inc., + #include "pars0pars.h" + #include "que0que.h" + #include "row0mysql.h" ++#include "sql/sched_affinity_manager.h" + #include "sql_thd_internal_api.h" + #include "srv0mon.h" + +@@ -3154,6 +3156,17 @@ static void srv_purge_coordinator_suspend( + + /** Purge coordinator thread that schedules the purge tasks. */ + void srv_purge_coordinator_thread() { ++ auto sched_affinity_manager = sched_affinity::Sched_affinity_manager::get_instance(); ++ bool is_registered_to_sched_affinity = false; ++ auto pid = sched_affinity::gettid(); ++ if (sched_affinity_manager != nullptr && ++ !(is_registered_to_sched_affinity = ++ sched_affinity_manager->register_thread( ++ sched_affinity::Thread_type::PURGE_COORDINATOR, pid))) { ++ ib::error(ER_CANNOT_REGISTER_THREAD_TO_SCHED_AFFINIFY_MANAGER) ++ << "purge_coordinator"; ++ } ++ + srv_slot_t *slot; + + #ifdef UNIV_PFS_THREAD +@@ -3272,6 +3285,12 @@ void srv_purge_coordinator_thread() { + srv_thread_delay_cleanup_if_needed(false); + + destroy_thd(thd); ++ ++ if (is_registered_to_sched_affinity && ++ !sched_affinity_manager->unregister_thread(pid)) { ++ ib::error(ER_CANNOT_UNREGISTER_THREAD_FROM_SCHED_AFFINIFY_MANAGER) ++ << "purge_coordinator"; ++ } + } + + /** Enqueues a task to server task queue and releases a worker thread, if there diff --git a/KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch b/KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch new file mode 100644 index 0000000000000000000000000000000000000000..03541ab2b57ae12f0e8ae32e5c6ed2f9dc8cf5a7 --- /dev/null +++ b/KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch @@ -0,0 +1,2962 @@ +diff --git a/plugin/thread_pool/CMakeLists.txt b/plugin/thread_pool/CMakeLists.txt +new file mode 100644 +index 00000000000..35cbdff5140 +--- /dev/null ++++ b/plugin/thread_pool/CMakeLists.txt +@@ -0,0 +1,26 @@ ++# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. ++# Copyright (c) 2022 Huawei Technologies Co., Ltd. ++# ++# This program is free software; you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation; version 2 of the License. ++# ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. 
diff --git a/KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch b/KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch
new file mode 100644
index 0000000000000000000000000000000000000000..03541ab2b57ae12f0e8ae32e5c6ed2f9dc8cf5a7
--- /dev/null
+++ b/KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch
@@ -0,0 +1,2962 @@
+diff --git a/plugin/thread_pool/CMakeLists.txt b/plugin/thread_pool/CMakeLists.txt
+new file mode 100644
+index 00000000000..35cbdff5140
+--- /dev/null
++++ b/plugin/thread_pool/CMakeLists.txt
+@@ -0,0 +1,26 @@
++# Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
++# Copyright (c) 2022 Huawei Technologies Co., Ltd.
++#
++# This program is free software; you can redistribute it and/or modify
++# it under the terms of the GNU General Public License as published by
++# the Free Software Foundation; version 2 of the License.
++#
++# This program is distributed in the hope that it will be useful,
++# but WITHOUT ANY WARRANTY; without even the implied warranty of
++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++# GNU General Public License for more details.
++#
++# You should have received a copy of the GNU General Public License
++# along with this program; if not, write to the Free Software
++# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
++
++ADD_COMPILE_DEFINITIONS(MYSQL_DYNAMIC_PLUGIN)
++
++MYSQL_ADD_PLUGIN(thread_pool
++  threadpool_common.cc
++  threadpool_unix.cc
++  MODULE_ONLY
++  MODULE_OUTPUT_NAME "thread_pool"
++  )
++
+diff --git a/plugin/thread_pool/numa_affinity_manager.h b/plugin/thread_pool/numa_affinity_manager.h
+new file mode 100644
+index 00000000000..3471d328736
+--- /dev/null
++++ b/plugin/thread_pool/numa_affinity_manager.h
+@@ -0,0 +1,117 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++#ifndef NUMA_AFFINITY_MANAGER_H_
++#define NUMA_AFFINITY_MANAGER_H_
++
++#include <numa.h>
++#include <sys/syscall.h>
++#include <unistd.h>
++#include <iostream>
++#include <memory>
++#include <string>
++#include <vector>
++
++using namespace std;
++
++class numa_affinity_manager
++{
++public:
++  numa_affinity_manager() {}
++  virtual ~numa_affinity_manager() {}
++
++  bool init() {
++    initok = false;
++    cpu_count = get_sys_cpu();
++    numa_count = get_sys_numa();
++    if (cpu_count <= 0 || numa_count <= 0 ||
++        cpu_count % numa_count != 0) {
++      return false;
++    }
++
++    int cpu_per_numa = cpu_count / numa_count;
++    int start = 0;
++    numa_cpu_map.clear();
++    auto delete_cpumask = [](bitmask *ptr) {
++      if (ptr != nullptr) {
++        numa_free_cpumask(ptr);
++      }
++    };
++    for (int i = 0; i < numa_count; i++) {
++      auto msk = numa_allocate_cpumask();
++      if (msk == nullptr) {
++        return false;
++      }
++
++      for (int j = 0; j < cpu_per_numa; j++) {
++        numa_bitmask_setbit(msk, start + j);
++      }
++      numa_cpu_map.emplace_back(msk, delete_cpumask);
++      start += cpu_per_numa;
++    }
++    initok = true;
++    return true;
++  }
++
++  bool bind_numa(int group_id) {
++    if (initok) {
++      pid_t pid = gettid();
++      return (numa_sched_setaffinity(
++          pid, numa_cpu_map[group_id % numa_cpu_map.size()].get()) == 0);
++    }
++
++    return false;
++  }
++
++protected:
++  int get_sys_cpu() {
++    return numa_num_configured_cpus();
++  }
++
++  int get_sys_numa() {
++    return numa_num_configured_nodes();
++  }
++
++  pid_t gettid() {
++    return static_cast<pid_t>(syscall(SYS_gettid));
++  }
++
++public:
++  void print_cpumask(const string &name, bitmask *msk) {
++    cout << name << ": ";
++    for (unsigned int i = 0; i < msk->size; i++) {
++      if (numa_bitmask_isbitset(msk, i)) {
++        cout << i << " ";
++      }
++    }
++    cout << endl;
++  }
++  void dump() {
++    cout << "initok: " << initok << endl;
++    cout << "cpu_count: " << cpu_count << endl;
++    cout << "numa_count: " << numa_count << endl;
++
++    for (unsigned int i = 0; i < numa_cpu_map.size(); i++) {
++      string name = "numa_cpu_map[" + to_string(i) + "]";
++      print_cpumask(name, numa_cpu_map[i].get());
++    }
++  }
++
++private:
++  bool initok{false};
++  int cpu_count{0};
++  int numa_count{0};
++  vector<shared_ptr<bitmask>> numa_cpu_map;
++};
++
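A minimal usage sketch, assuming the header above is available as
"numa_affinity_manager.h" and the program is linked with -lnuma (the driver is
illustrative only and not part of the plugin):

  #include "numa_affinity_manager.h"
  #include <cstdio>

  int main() {
    numa_affinity_manager mgr;
    if (!mgr.init()) {  // fails if there is no NUMA, or cpus split unevenly
      std::printf("numa_affinity_manager init failed\n");
      return 1;
    }
    mgr.dump();                       // show the per-node cpu masks
    return mgr.bind_numa(0) ? 0 : 1;  // pin the calling thread to node 0
  }

Note that init() requires cpu_count % numa_count == 0; on machines with offlined
CPUs or asymmetric nodes the manager simply stays disabled and bind_numa() returns
false.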
++#endif // NUMA_AFFINITY_MANAGER_H_ +diff --git a/plugin/thread_pool/threadpool.h b/plugin/thread_pool/threadpool.h +new file mode 100644 +index 00000000000..f4dd68dc8a9 +--- /dev/null ++++ b/plugin/thread_pool/threadpool.h +@@ -0,0 +1,89 @@ ++/* Copyright (C) 2012 Monty Program Ab ++ Copyright (C) 2022 Huawei Technologies Co., Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; version 2 of the License. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ ++#ifndef THREADPOOL_H_ ++#define THREADPOOL_H_ ++ ++#include "sql/sql_class.h" ++#include "sql/mysqld_thd_manager.h" ++#include "sql/conn_handler/connection_handler_manager.h" ++#include "sql/conn_handler/channel_info.h" ++ ++struct SHOW_VAR; ++ ++#define MAX_THREAD_GROUPS 1024 ++#define MAX_CONNECTIONS 100000 ++ ++ ++enum tp_high_prio_mode_t { ++ TP_HIGH_PRIO_MODE_TRANSACTIONS, ++ TP_HIGH_PRIO_MODE_STATEMENTS, ++ TP_HIGH_PRIO_MODE_NONE ++}; ++ ++/* Threadpool parameters */ ++extern uint threadpool_idle_timeout; /* Shutdown idle worker threads after this timeout */ ++extern bool threadpool_dedicated_listener; /* Control whether listener be dedicated */ ++extern uint threadpool_size; /* Number of parallel executing threads */ ++extern bool threadpool_sched_affinity; /* Control whether thread group scheduling affinity */ ++extern uint threadpool_max_threads; ++extern uint threadpool_stall_limit; /* time interval in 10 ms units for stall checks*/ ++extern uint threadpool_oversubscribe; /* Maximum active threads in group */ ++extern uint threadpool_toobusy; /* Maximum active and waiting threads in group */ ++ ++/* Possible values for thread_pool_high_prio_mode */ ++extern const char *threadpool_high_prio_mode_names[]; ++ ++/* Common thread pool routines, suitable for different implementations */ ++extern void threadpool_remove_connection(THD *thd); ++extern int threadpool_process_request(THD *thd); ++extern int threadpool_add_connection(THD *thd); ++ ++/* ++ Functions used by scheduler. ++ OS-specific implementations are in ++ threadpool_unix.cc or threadpool_win.cc ++*/ ++extern bool tp_init(); ++extern void tp_wait_begin(THD *, int); ++extern void tp_wait_end(THD *); ++extern void tp_post_kill_notification(THD *thd) noexcept; ++extern bool tp_add_connection(Channel_info *); ++extern void tp_end(void); ++extern void tp_fake_end(void); ++extern void threadpool_remove_connection(THD *thd); ++extern bool thread_attach(THD *thd); ++ ++extern THD_event_functions tp_event_functions; ++ ++/* ++ Threadpool statistics ++*/ ++struct TP_STATISTICS { ++ /* Current number of worker thread. 
++   */
++  std::atomic<int> num_worker_threads;
++};
++
++extern TP_STATISTICS tp_stats;
++
++/* Functions to set threadpool parameters */
++extern void tp_set_threadpool_size(uint val) noexcept;
++extern void tp_set_threadpool_stall_limit(uint val) noexcept;
++
++extern uint tp_get_thdvar_high_prio_tickets(THD *thd);
++extern uint tp_get_thdvar_high_prio_mode(THD *thd);
++
++#endif  // THREADPOOL_H_
++
+diff --git a/plugin/thread_pool/threadpool_common.cc b/plugin/thread_pool/threadpool_common.cc
+new file mode 100644
+index 00000000000..00595fc4b3f
+--- /dev/null
++++ b/plugin/thread_pool/threadpool_common.cc
+@@ -0,0 +1,765 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++
++#include "threadpool.h"
++#include "threadpool_unix.h"
++#include "my_thread_local.h"
++#include "my_sys.h"
++#include "mysql/plugin.h"
++#include "mysql/psi/mysql_idle.h"
++#include "mysql/thread_pool_priv.h"
++#include "sql/debug_sync.h"
++#include "sql/mysqld.h"
++#include "sql/sql_class.h"
++#include "sql/sql_connect.h"
++#include "sql/protocol_classic.h"
++#include "sql/sql_parse.h"
++#include "sql/sql_table.h"
++#include "sql/field.h"
++#include "sql/sql_show.h"
++#include <climits>
++#include <unistd.h>
++
++#define MYSQL_SERVER 1
++
++/* Threadpool parameters */
++uint threadpool_idle_timeout;
++bool threadpool_dedicated_listener;
++uint threadpool_size;
++bool threadpool_sched_affinity;
++uint threadpool_stall_limit;
++uint threadpool_max_threads;
++uint threadpool_oversubscribe;
++uint threadpool_toobusy;
++
++/* Stats */
++TP_STATISTICS tp_stats;
++
++/*
++  Worker thread contexts, and THD contexts.
++  =========================================
++
++  Both worker threads and connections have their own sets of thread-local
++  variables. At the moment these are mysys_var (which carries specific data
++  for dbug, my_error and similar goodies) and the PSI per-client structure.
++
++  Whenever a query is executed, the following needs to be done:
++
++  1. Save the worker thread context.
++  2. Change TLS variables to connection-specific ones using
++     thread_attach(THD*). This function does some additional work.
++  3. Process the query.
++  4. Restore the worker thread context.
++
++  Connection login and termination follow a similar schema w.r.t. saving and
++  restoring contexts.
++
++  For both the worker thread and the connection, mysys variables are created
++  using my_thread_init() and freed with my_thread_end().
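++
++  Illustratively, each unit of work follows this shape (a sketch using the
++  names defined below; it is not extra code added by the patch):
++
++    Worker_thread_context saved;  // capture the worker's PSI thread / thread id
++    thread_attach(thd);           // point TLS at the connection's context
++    ... login, query processing, or logout step runs here ...
++    // leaving scope destroys 'saved', restoring the worker's own context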
++ ++*/ ++class Worker_thread_context { ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ PSI_thread *const psi_thread; ++#endif ++#ifndef NDEBUG ++ const my_thread_id thread_id; ++#endif ++ public: ++ Worker_thread_context() noexcept ++ : ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ psi_thread(PSI_THREAD_CALL(get_thread)()) ++#endif ++#ifndef NDEBUG ++ , ++ thread_id(my_thread_var_id()) ++#endif ++ { ++ } ++ ++ ~Worker_thread_context() noexcept { ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ PSI_THREAD_CALL(set_thread)(psi_thread); ++#endif ++#ifndef NDEBUG ++ set_my_thread_var_id(thread_id); ++#endif ++ THR_MALLOC = nullptr; ++ } ++}; ++ ++/* ++ Attach/associate the connection with the OS thread, ++*/ ++bool thread_attach(THD *thd) { ++#ifndef NDEBUG ++ set_my_thread_var_id(thd->thread_id()); ++#endif ++ thd->thread_stack = (char *)&thd; ++ thd->store_globals(); ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ PSI_THREAD_CALL(set_thread)(thd->get_psi()); ++#endif ++ mysql_socket_set_thread_owner( ++ thd->get_protocol_classic()->get_vio()->mysql_socket); ++ return 0; ++} ++ ++#ifdef HAVE_PSI_STATEMENT_INTERFACE ++extern PSI_statement_info stmt_info_new_packet; ++#endif ++ ++static void threadpool_net_before_header_psi_noop(NET * /* net */, ++ void * /* user_data */, ++ size_t /* count */) {} ++ ++static void threadpool_init_net_server_extension(THD *thd) { ++#ifdef HAVE_PSI_INTERFACE ++ // socket_connection.cc:init_net_server_extension should have been called ++ // already for us. We only need to overwrite the "before" callback ++ assert(thd->m_net_server_extension.m_user_data == thd); ++ thd->m_net_server_extension.m_before_header = ++ threadpool_net_before_header_psi_noop; ++#else ++ assert(thd->get_protocol_classic()->get_net()->extension == NULL); ++#endif ++} ++ ++int threadpool_add_connection(THD *thd) { ++ int retval = 1; ++ Worker_thread_context worker_context; ++ ++ my_thread_init(); ++ ++ /* Create new PSI thread for use with the THD. */ ++#ifdef HAVE_PSI_THREAD_INTERFACE ++ thd->set_psi(PSI_THREAD_CALL(new_thread)(key_thread_one_connection, thd, ++ thd->thread_id())); ++#endif ++ ++ /* Login. */ ++ thread_attach(thd); ++ thd->start_utime = my_micro_time(); ++ thd->store_globals(); ++ ++ if (thd_prepare_connection(thd)) { ++ goto end; ++ } ++ ++ /* ++ Check if THD is ok, as prepare_new_connection_state() ++ can fail, for example if init command failed. 
++   */
++  if (thd_connection_alive(thd)) {
++    retval = 0;
++    thd_set_net_read_write(thd, 1);
++    MYSQL_SOCKET_SET_STATE(thd->get_protocol_classic()->get_vio()->mysql_socket,
++                           PSI_SOCKET_STATE_IDLE);
++    thd->m_server_idle = true;
++    threadpool_init_net_server_extension(thd);
++  }
++
++end:
++  if (retval) {
++    Connection_handler_manager *handler_manager =
++        Connection_handler_manager::get_instance();
++    handler_manager->inc_aborted_connects();
++  }
++  return retval;
++}
++
++static Connection_handler_functions tp_chf = {
++  0,
++  tp_add_connection,
++  tp_end
++};
++
++THD_event_functions tp_event_functions = {
++  tp_wait_begin,
++  tp_wait_end,
++  tp_post_kill_notification
++};
++
++void threadpool_remove_connection(THD *thd) {
++  Worker_thread_context worker_context;
++
++  thread_attach(thd);
++  thd_set_net_read_write(thd, 0);
++
++  end_connection(thd);
++  close_connection(thd, 0);
++
++  thd->release_resources();
++
++#ifdef HAVE_PSI_THREAD_INTERFACE
++  PSI_THREAD_CALL(delete_thread)(thd->get_psi());
++#endif
++
++  Global_THD_manager::get_instance()->remove_thd(thd);
++  Connection_handler_manager::dec_connection_count();
++  delete thd;
++}
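Before threadpool_process_request() below, it is worth seeing why its body loops at
all. A standalone model (illustrative only; a queue stands in for the bytes that the
SSL layer prereads and vio->has_data() reports):

  #include <cstdio>
  #include <queue>

  int main() {
    std::queue<int> preread;  // data vio->has_data() would report as buffered
    preread.push(1);
    preread.push(2);          // two commands decrypted ahead by the SSL layer

    int served = 0;
    for (;;) {
      ++served;                    // do_command(): handle one client command
      if (preread.empty()) break;  // has_data() == false: re-arm socket, stop
      preread.pop();               // more buffered input: keep the batch going
    }
    std::printf("served %d command(s) in one wakeup\n", served);  // 3
    return 0;
  }

Without the loop, buffered commands would sit undetected, because the poller only
reports new bytes arriving on the socket, not bytes already decrypted in user space.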
++/**
++  Process a single client request or a single batch.
++*/
++int threadpool_process_request(THD *thd) {
++  int retval = 0;
++  Worker_thread_context worker_context;
++
++  thread_attach(thd);
++
++  if (thd->killed == THD::KILL_CONNECTION) {
++    /*
++      killed flag was set by timeout handler
++      or KILL command. Return error.
++    */
++    retval = 1;
++    goto end;
++  }
++
++  /*
++    In the loop below, the flow is essentially a copy of the
++    thread-per-connection logic, see do_handle_one_connection() in
++    sql_connect.c
++
++    The goal is to execute a single query, thus the loop is normally executed
++    only once. However for SSL connections, it can be executed multiple times
++    (SSL can preread and cache incoming data, and vio->has_data() checks if it
++    was the case).
++   */
++  for (;;) {
++    Vio *vio;
++    thd_set_net_read_write(thd, 0);
++
++    if ((retval = do_command(thd)) != 0) goto end;
++
++    if (!thd_connection_alive(thd)) {
++      retval = 1;
++      goto end;
++    }
++
++    vio = thd->get_protocol_classic()->get_vio();
++    if (!vio->has_data(vio)) {
++      /* More info on this debug sync is in sql_parse.cc */
++      DEBUG_SYNC(thd, "before_do_command_net_read");
++      thd_set_net_read_write(thd, 1);
++      goto end;
++    }
++    if (!thd->m_server_idle) {
++      MYSQL_SOCKET_SET_STATE(vio->mysql_socket, PSI_SOCKET_STATE_IDLE);
++      MYSQL_START_IDLE_WAIT(thd->m_idle_psi, &thd->m_idle_state);
++      thd->m_server_idle = true;
++    }
++  }
++
++end:
++  if (!retval && !thd->m_server_idle) {
++    MYSQL_SOCKET_SET_STATE(thd->get_protocol_classic()->get_vio()->mysql_socket,
++                           PSI_SOCKET_STATE_IDLE);
++    MYSQL_START_IDLE_WAIT(thd->m_idle_psi, &thd->m_idle_state);
++    thd->m_server_idle = true;
++  }
++
++  return retval;
++}
++
++static void fix_threadpool_size(THD *, struct SYS_VAR *, void *,
++                                const void *value)
++{
++  threadpool_size = *static_cast<const uint *>(value);
++  tp_set_threadpool_size(threadpool_size);
++}
++
++static void fix_threadpool_stall_limit(THD *, struct SYS_VAR *, void *,
++                                       const void *value)
++{
++  threadpool_stall_limit = *static_cast<const uint *>(value);
++  tp_set_threadpool_stall_limit(threadpool_stall_limit);
++}
++
++static inline int my_getncpus() noexcept {
++#ifdef _SC_NPROCESSORS_ONLN
++  return sysconf(_SC_NPROCESSORS_ONLN);
++#else
++  return 2; /* The value returned by the old my_getncpus implementation */
++#endif
++}
++
++static MYSQL_SYSVAR_UINT(idle_timeout, threadpool_idle_timeout,
++  PLUGIN_VAR_RQCMDARG,
++  "Timeout in seconds for an idle thread in the thread pool. "
++  "Worker threads are shut down after this timeout.",
++  NULL, NULL, 60, 1, UINT_MAX, 1);
++
++static MYSQL_SYSVAR_UINT(oversubscribe, threadpool_oversubscribe,
++  PLUGIN_VAR_RQCMDARG,
++  "How many additional active worker threads in a group are allowed.",
++  NULL, NULL, 3, 1, 1000, 1);
++
++static MYSQL_SYSVAR_UINT(toobusy, threadpool_toobusy,
++  PLUGIN_VAR_RQCMDARG,
++  "How many additional active and waiting worker threads in a group are allowed.",
++  NULL, NULL, 13, 1, 1000, 1);
++
++static MYSQL_SYSVAR_BOOL(dedicated_listener, threadpool_dedicated_listener,
++  PLUGIN_VAR_RQCMDARG,
++  "Control whether the listener thread is dedicated (never handles events itself).",
++  nullptr, nullptr, false);
++
++static MYSQL_SYSVAR_UINT(size, threadpool_size,
++  PLUGIN_VAR_RQCMDARG,
++  "Number of thread groups in the pool. "
++  "This parameter is roughly equivalent to the maximum number of concurrently "
++  "executing threads (threads in a waiting state do not count as executing).",
++  NULL, fix_threadpool_size, (uint)my_getncpus(), 1, MAX_THREAD_GROUPS, 1);
++
++static MYSQL_SYSVAR_BOOL(sched_affinity, threadpool_sched_affinity,
++  PLUGIN_VAR_RQCMDARG,
++  "Control whether thread groups use NUMA scheduling affinity.",
++  nullptr, nullptr, false);
++
++static MYSQL_SYSVAR_UINT(stall_limit, threadpool_stall_limit,
++  PLUGIN_VAR_RQCMDARG,
++  "Maximum query execution time in milliseconds, "
++  "before an executing non-yielding thread is considered stalled. "
++ "If a worker thread is stalled, additional worker thread " ++ "may be created to handle remaining clients.", ++ NULL, fix_threadpool_stall_limit, 500, 10, UINT_MAX, 1); ++ ++static MYSQL_SYSVAR_UINT(max_threads, threadpool_max_threads, ++ PLUGIN_VAR_RQCMDARG, ++ "Maximum allowed number of worker threads in the thread pool", ++ NULL, NULL, MAX_CONNECTIONS, 1, MAX_CONNECTIONS, 1); ++ ++static int threadpool_plugin_init(void *) ++{ ++ DBUG_ENTER("threadpool_plugin_init"); ++ ++ tp_init(); ++ my_connection_handler_set(&tp_chf, &tp_event_functions); ++ DBUG_RETURN(0); ++} ++ ++static int threadpool_plugin_deinit(void *) ++{ ++ DBUG_ENTER("threadpool_plugin_deinit"); ++ my_connection_handler_reset(); ++ DBUG_RETURN(0); ++} ++ ++static MYSQL_THDVAR_UINT(high_prio_tickets, ++ PLUGIN_VAR_RQCMDARG, ++ "Number of tickets to enter the high priority event queue for each " ++ "transaction.", ++ NULL, NULL, UINT_MAX, 0, UINT_MAX, 1); ++ ++const char *threadpool_high_prio_mode_names[] = {"transactions", "statements", ++ "none", NullS}; ++TYPELIB threadpool_high_prio_mode_typelib = { ++ array_elements(threadpool_high_prio_mode_names) - 1, "", ++ threadpool_high_prio_mode_names, NULL ++}; ++ ++static MYSQL_THDVAR_ENUM(high_prio_mode, ++ PLUGIN_VAR_RQCMDARG, ++ "High priority queue mode: one of 'transactions', 'statements' or 'none'. " ++ "In the 'transactions' mode the thread pool uses both high- and low-priority " ++ "queues depending on whether an event is generated by an already started " ++ "transaction and whether it has any high priority tickets (see " ++ "thread_pool_high_prio_tickets). In the 'statements' mode all events (i.e. " ++ "individual statements) always go to the high priority queue, regardless of " ++ "the current transaction state and high priority tickets. " ++ "'none' is the opposite of 'statements', i.e. 
disables the high priority queue " ++ "completely.", ++ NULL, NULL, TP_HIGH_PRIO_MODE_TRANSACTIONS, &threadpool_high_prio_mode_typelib); ++ ++static uint &idle_timeout = threadpool_idle_timeout; ++static bool &dedicated_listener = threadpool_dedicated_listener; ++static uint &size = threadpool_size; ++static bool &sched_affinity = threadpool_sched_affinity; ++static uint &stall_limit = threadpool_stall_limit; ++static uint &max_threads = threadpool_max_threads; ++static uint &oversubscribe = threadpool_oversubscribe; ++static uint &toobusy = threadpool_toobusy; ++ ++SYS_VAR *system_variables[] = { ++ MYSQL_SYSVAR(idle_timeout), ++ MYSQL_SYSVAR(dedicated_listener), ++ MYSQL_SYSVAR(size), ++ MYSQL_SYSVAR(sched_affinity), ++ MYSQL_SYSVAR(max_threads), ++ MYSQL_SYSVAR(stall_limit), ++ MYSQL_SYSVAR(oversubscribe), ++ MYSQL_SYSVAR(toobusy), ++ MYSQL_SYSVAR(high_prio_tickets), ++ MYSQL_SYSVAR(high_prio_mode), ++ NULL ++}; ++ ++namespace Show { ++ ++static ST_FIELD_INFO groups_fields_info[] = ++{ ++ {"GROUP_ID", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"CONNECTIONS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"THREADS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"ACTIVE_THREADS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"STANDBY_THREADS", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"QUEUE_LENGTH", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"HAS_LISTENER", 1, MYSQL_TYPE_TINY, 0, 0, 0, 0}, ++ {"IS_STALLED", 1, MYSQL_TYPE_TINY, 0, 0, 0, 0}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0} ++}; ++ ++} // namespace Show ++ ++ ++static int groups_fill_table(THD* thd, TABLE_LIST* tables, Item*) ++{ ++ if (!all_groups) ++ return 0; ++ ++ TABLE* table = tables->table; ++ for (uint i = 0; i < MAX_THREAD_GROUPS && all_groups[i].pollfd != -1; i++) ++ { ++ thread_group_t* group = &all_groups[i]; ++ ++ mysql_mutex_lock(&group->mutex); ++ ++ /* ID */ ++ table->field[0]->store(i, true); ++ /* CONNECTION_COUNT */ ++ table->field[1]->store(group->connection_count, true); ++ /* THREAD_COUNT */ ++ table->field[2]->store(group->thread_count, true); ++ /* ACTIVE_THREAD_COUNT */ ++ table->field[3]->store(group->active_thread_count, true); ++ /* STANDBY_THREAD_COUNT */ ++ table->field[4]->store(group->waiting_thread_count, true); ++ /* QUEUE LENGTH */ ++ uint queue_len = group->high_prio_queue.elements() ++ + group->queue.elements(); ++ table->field[5]->store(queue_len, true); ++ /* HAS_LISTENER */ ++ table->field[6]->store((longlong)(group->listener != 0), true); ++ /* IS_STALLED */ ++ table->field[7]->store(group->stalled, true); ++ ++ mysql_mutex_unlock(&group->mutex); ++ ++ if (schema_table_store_record(thd, table)) ++ return 1; ++ } ++ return 0; ++} ++ ++ ++static int groups_init(void* p) ++{ ++ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p; ++ schema->fields_info = Show::groups_fields_info; ++ schema->fill_table = groups_fill_table; ++ return 0; ++} ++ ++ ++namespace Show { ++ ++static ST_FIELD_INFO queues_field_info[] = ++{ ++ {"GROUP_ID", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"POSITION", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"PRIORITY", 1, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"CONNECTION_ID", 19, MYSQL_TYPE_LONGLONG, 0, MY_I_S_UNSIGNED, 0, 0}, ++ {"QUEUEING_TIME_MICROSECONDS", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0} ++}; ++ ++} // namespace Show ++ ++typedef connection_queue_t::Iterator connection_queue_iterator; ++ ++static int queues_fill_table(THD* thd, TABLE_LIST* tables, Item*) ++{ ++ if (!all_groups) ++ return 0; ++ ++ TABLE* table = tables->table; ++ for (uint group_id = 0; ++ group_id < MAX_THREAD_GROUPS && 
all_groups[group_id].pollfd != -1; ++ group_id++) ++ { ++ thread_group_t* group = &all_groups[group_id]; ++ ++ mysql_mutex_lock(&group->mutex); ++ bool err = false; ++ int pos = 0; ++ ulonglong now = my_microsecond_getsystime(); ++ connection_queue_t queues[NQUEUES] = {group->high_prio_queue, group->queue}; ++ for (uint prio = 0; prio < NQUEUES && !err; prio++) ++ { ++ connection_queue_iterator it(queues[prio]); ++ connection_t* c; ++ while ((c = it++) != nullptr) ++ { ++ /* GROUP_ID */ ++ table->field[0]->store(group_id, true); ++ /* POSITION */ ++ table->field[1]->store(pos++, true); ++ /* PRIORITY */ ++ table->field[2]->store(prio, true); ++ /* CONNECTION_ID */ ++ if (c->thd != nullptr) { ++ table->field[3]->store(c->thd->thread_id(), true); ++ } else { ++ table->field[3]->store(0, true); ++ } ++ /* QUEUEING_TIME */ ++ table->field[4]->store(now - c->enqueue_time, true); ++ ++ err = schema_table_store_record(thd, table); ++ if (err) ++ break; ++ } ++ } ++ mysql_mutex_unlock(&group->mutex); ++ if (err) ++ return 1; ++ } ++ return 0; ++} ++ ++static int queues_init(void* p) ++{ ++ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p; ++ schema->fields_info = Show::queues_field_info; ++ schema->fill_table = queues_fill_table; ++ return 0; ++} ++ ++namespace Show { ++ ++static ST_FIELD_INFO stats_fields_info[] = ++{ ++ {"GROUP_ID", 6, MYSQL_TYPE_LONG, 0, 0, 0, 0}, ++ {"THREAD_CREATIONS", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"THREAD_CREATIONS_DUE_TO_STALL", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"WAKES", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"WAKES_DUE_TO_STALL", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"THROTTLES", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"STALLS", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"POLLS_BY_LISTENER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"POLLS_BY_WORKER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"DEQUEUES_BY_LISTENER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {"DEQUEUES_BY_WORKER", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 0}, ++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0} ++}; ++ ++} // namespace Show ++ ++ ++static int stats_fill_table(THD* thd, TABLE_LIST* tables, Item*) ++{ ++ if (!all_groups) ++ return 0; ++ ++ TABLE* table = tables->table; ++ for (uint i = 0; i < MAX_THREAD_GROUPS && all_groups[i].pollfd != -1; i++) ++ { ++ table->field[0]->store(i, true); ++ thread_group_t* group = &all_groups[i]; ++ ++ mysql_mutex_lock(&group->mutex); ++ thread_group_counters_t* counters = &group->counters; ++ table->field[1]->store(counters->thread_creations, true); ++ table->field[2]->store(counters->thread_creations_due_to_stall, true); ++ table->field[3]->store(counters->wakes, true); ++ table->field[4]->store(counters->wakes_due_to_stall, true); ++ table->field[5]->store(counters->throttles, true); ++ table->field[6]->store(counters->stalls, true); ++ table->field[7]->store(counters->polls[LISTENER], true); ++ table->field[8]->store(counters->polls[WORKER], true); ++ table->field[9]->store(counters->dequeues[LISTENER], true); ++ table->field[10]->store(counters->dequeues[WORKER], true); ++ mysql_mutex_unlock(&group->mutex); ++ if (schema_table_store_record(thd, table)) ++ return 1; ++ } ++ return 0; ++} ++ ++static int stats_init(void* p) ++{ ++ ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p; ++ schema->fields_info = Show::stats_fields_info; ++ schema->fill_table = stats_fill_table; ++ return 0; ++} ++ ++ ++namespace Show { ++ ++static ST_FIELD_INFO waits_fields_info[] = ++{ ++ {"REASON", 16, MYSQL_TYPE_STRING, 0, 0, 0, 0}, ++ {"COUNT", 19, MYSQL_TYPE_LONGLONG, 0, 0, 0, 
0},
++  {0, 0, MYSQL_TYPE_STRING, 0, 0, 0, 0}
++};
++
++} // namespace Show
++
++/* See thd_wait_type enum for explanation */
++static const LEX_CSTRING wait_reasons[THD_WAIT_LAST] =
++{
++  {STRING_WITH_LEN("UNKNOWN")},
++  {STRING_WITH_LEN("SLEEP")},
++  {STRING_WITH_LEN("DISKIO")},
++  {STRING_WITH_LEN("ROW_LOCK")},
++  {STRING_WITH_LEN("GLOBAL_LOCK")},
++  {STRING_WITH_LEN("META_DATA_LOCK")},
++  {STRING_WITH_LEN("TABLE_LOCK")},
++  {STRING_WITH_LEN("USER_LOCK")},
++  {STRING_WITH_LEN("BINLOG")},
++  {STRING_WITH_LEN("GROUP_COMMIT")},
++  {STRING_WITH_LEN("SYNC")}
++};
++
++extern std::atomic<ulonglong> tp_waits[THD_WAIT_LAST];
++
++static int waits_fill_table(THD* thd, TABLE_LIST* tables, Item*)
++{
++  if (!all_groups)
++    return 0;
++
++  TABLE* table = tables->table;
++  for (unsigned int i = 0; i < THD_WAIT_LAST; i++)
++  {
++    table->field[0]->store(wait_reasons[i].str, wait_reasons[i].length, system_charset_info);
++    table->field[1]->store(tp_waits[i], true);
++    if (schema_table_store_record(thd, table))
++      return 1;
++  }
++  return 0;
++}
++
++static int waits_init(void* p)
++{
++  ST_SCHEMA_TABLE* schema = (ST_SCHEMA_TABLE*)p;
++  schema->fields_info = Show::waits_fields_info;
++  schema->fill_table = waits_fill_table;
++  return 0;
++}
++
++struct st_mysql_daemon thread_pool_plugin =
++{ MYSQL_DAEMON_INTERFACE_VERSION };
++
++static struct st_mysql_information_schema plugin_descriptor =
++{ MYSQL_INFORMATION_SCHEMA_INTERFACE_VERSION };
++
++mysql_declare_plugin(thread_pool)
++{
++  MYSQL_DAEMON_PLUGIN,
++  &thread_pool_plugin,
++  "thread_pool",
++  "TEST_TEST",
++  "thread pool plugin extracted from percona server",
++  PLUGIN_LICENSE_GPL,
++  threadpool_plugin_init,   /* Plugin Init */
++  nullptr,                  /* Plugin Check uninstall */
++  threadpool_plugin_deinit, /* Plugin Deinit */
++  0x0100 /* 1.0 */,
++  nullptr,                  /* status variables */
++  system_variables,         /* system variables */
++  nullptr,                  /* config options */
++  0,                        /* flags */
++},
++{
++  MYSQL_INFORMATION_SCHEMA_PLUGIN,
++  &plugin_descriptor,
++  "THREAD_POOL_GROUPS",
++  "Vladislav Vaintroub",
++  "Provides information about threadpool groups.",
++  PLUGIN_LICENSE_GPL,
++  groups_init,
++  nullptr,
++  nullptr,
++  0x0100,
++  nullptr,
++  nullptr,
++  nullptr,
++  0,
++},
++{
++  MYSQL_INFORMATION_SCHEMA_PLUGIN,
++  &plugin_descriptor,
++  "THREAD_POOL_QUEUES",
++  "Vladislav Vaintroub",
++  "Provides information about threadpool queues.",
++  PLUGIN_LICENSE_GPL,
++  queues_init,
++  nullptr,
++  nullptr,
++  0x0100,
++  nullptr,
++  nullptr,
++  nullptr,
++  0,
++},
++{
++  MYSQL_INFORMATION_SCHEMA_PLUGIN,
++  &plugin_descriptor,
++  "THREAD_POOL_STATS",
++  "Vladislav Vaintroub",
++  "Provides performance counter information for threadpool.",
++  PLUGIN_LICENSE_GPL,
++  stats_init,
++  nullptr,
++  nullptr,
++  0x0100,
++  nullptr,
++  nullptr,
++  nullptr,
++  0,
++},
++{
++  MYSQL_INFORMATION_SCHEMA_PLUGIN,
++  &plugin_descriptor,
++  "THREAD_POOL_WAITS",
++  "Vladislav Vaintroub",
++  "Provides wait counters for threadpool.",
++  PLUGIN_LICENSE_GPL,
++  waits_init,
++  nullptr,
++  nullptr,
++  0x0100,
++  nullptr,
++  nullptr,
++  nullptr,
++  0,
++}
++mysql_declare_plugin_end;
++
++uint tp_get_thdvar_high_prio_tickets(THD *thd) {
++  return THDVAR(thd, high_prio_tickets);
++}
++
++uint tp_get_thdvar_high_prio_mode(THD *thd) {
++  return THDVAR(thd, high_prio_mode);
++}
++
+diff --git a/plugin/thread_pool/threadpool_unix.cc b/plugin/thread_pool/threadpool_unix.cc
+new file mode 100644
+index 00000000000..a9fdf3dbfcd
+--- /dev/null
++++ b/plugin/thread_pool/threadpool_unix.cc
+@@ -0,0 +1,1794 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++#include "threadpool_unix.h"
++#include "sql/debug_sync.h"
++#include "sql/log.h"
++#include "sql/protocol_classic.h"
++#include "my_sys.h"
++#include "my_systime.h"
++#include "mysql/thread_pool_priv.h"  // thd_is_transaction_active()
++#include "mysql/plugin.h"
++#include "threadpool.h"
++#include <set>
++#include <mutex>
++
++#define MYSQL_SERVER 1
++
++/** Maximum number of native events a listener can read in one go */
++#define MAX_EVENTS 1024
++
++/** Define if wait_begin() should create threads if necessary without waiting
++for stall detection to kick in */
++#define THREADPOOL_CREATE_THREADS_ON_WAIT
++
++/** Indicates that the threadpool was initialized */
++static bool threadpool_started = false;
++
++/*
++  Define PSI Keys for performance schema.
++  We have a mutex per group, worker threads, a condition per worker thread,
++  and a timer thread with its own mutex and condition.
++*/
++
++#ifdef HAVE_PSI_INTERFACE
++static PSI_mutex_key key_group_mutex;
++static PSI_mutex_key key_timer_mutex;
++static PSI_mutex_info mutex_list[] = {
++    {&key_group_mutex, "group_mutex", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_mutex, "timer_mutex", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
++
++static PSI_cond_key key_worker_cond;
++static PSI_cond_key key_timer_cond;
++static PSI_cond_info cond_list[] = {
++    {&key_worker_cond, "worker_cond", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_cond, "timer_cond", PSI_FLAG_SINGLETON, 0, PSI_DOCUMENT_ME}};
++
++static PSI_thread_key key_worker_thread;
++static PSI_thread_key key_timer_thread;
++static PSI_thread_info thread_list[] = {
++    {&key_worker_thread, "worker_thread", 0, 0, PSI_DOCUMENT_ME},
++    {&key_timer_thread, "timer_thread", PSI_FLAG_SINGLETON, 0,
++     PSI_DOCUMENT_ME}};
++#endif  // HAVE_PSI_INTERFACE
++
++thread_group_t all_groups[MAX_THREAD_GROUPS];
++numa_affinity_manager group_affinity;
++
++static uint group_count;
++
++/**
++  Used for printing the "pool blocked" message, see
++  print_pool_blocked_message();
++*/
++static ulonglong pool_block_start;
++
++/* Global timer for all groups */
++struct pool_timer_t {
++  mysql_mutex_t mutex;
++  mysql_cond_t cond;
++  std::atomic<ulonglong> current_microtime;
++  std::atomic<ulonglong> next_timeout_check;
++  int tick_interval;
++  bool shutdown;
++};
++
++static pool_timer_t pool_timer;
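A standalone model of the pool_timer_t tick loop above, using std:: primitives in
place of mysql_mutex_t/mysql_cond_t (an assumed simplification, illustrative only):

  #include <atomic>
  #include <chrono>
  #include <condition_variable>
  #include <cstdio>
  #include <mutex>
  #include <thread>

  struct TimerModel {
    std::mutex mutex;
    std::condition_variable cond;
    std::atomic<unsigned long long> current_microtime{0};
    int tick_interval_ms{10};
    bool shutdown{false};
  };

  int main() {
    TimerModel timer;
    std::thread tick([&timer] {
      std::unique_lock<std::mutex> lk(timer.mutex);
      while (!timer.shutdown) {
        // Equivalent of mysql_cond_timedwait(): wakes early only on signal.
        timer.cond.wait_for(lk, std::chrono::milliseconds(timer.tick_interval_ms));
        if (timer.shutdown) break;
        timer.current_microtime.store(42, std::memory_order_relaxed);
        // ... check_stall() / timeout_check() would run here ...
      }
    });
    {
      std::lock_guard<std::mutex> lk(timer.mutex);
      timer.shutdown = true;  // mirror stop_timer(): set the flag under the mutex
    }
    timer.cond.notify_one();
    tick.join();
    std::printf("timer stopped\n");
    return 0;
  }

The flag is set under the mutex before signalling, so the timer cannot miss the
shutdown between checking the flag and re-entering the wait.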
++static void queue_put(thread_group_t *thread_group, connection_t *connection);
++static int wake_thread(thread_group_t *thread_group,
++                       bool due_to_stall) noexcept;
++static void handle_event(connection_t *connection);
++static int wake_or_create_thread(thread_group_t *thread_group,
++                                 bool due_to_stall = false);
++static int create_worker(thread_group_t *thread_group, bool due_to_stall) noexcept;
++static void *admin_port_worker_main(void *param);
++static void *worker_main(void *param);
++static void *connection_detach_worker(void *param);
++static void check_stall(thread_group_t *thread_group);
++static void connection_abort(connection_t *connection);
++static void set_next_timeout_check(ulonglong abstime);
++static void print_pool_blocked_message(bool) noexcept;
++
++THD *thd_to_detach = nullptr;
++
++class ThreadPoolConnSet {
++public:
++  ThreadPoolConnSet() {}
++  virtual ~ThreadPoolConnSet() {}
++
++  bool empty() {
++    bool ret = false;
++    mtx.lock();
++    ret = conns.empty();
++    mtx.unlock();
++    return ret;
++  }
++
++  void killConns() {
++    mtx.lock();
++    for (auto &it : conns) {
++      THD *thd = it->thd;
++      if (current_thd != thd && thd->killed != THD::KILL_CONNECTION) {
++        mysql_mutex_lock(&thd->LOCK_thd_data);
++        thd->killed = THD::KILL_CONNECTION;
++        tp_post_kill_notification(thd);
++        mysql_mutex_unlock(&thd->LOCK_thd_data);
++      } else if (current_thd == thd) {
++        thd_to_detach = thd;
++      }
++    }
++    mtx.unlock();
++  }
++
++  void insert(connection_t *c) {
++    mtx.lock();
++    conns.insert(c);
++    mtx.unlock();
++  }
++
++  void erase(connection_t *c) {
++    mtx.lock();
++    conns.erase(c);
++    mtx.unlock();
++  }
++
++public:
++  std::set<connection_t *> conns;
++  std::mutex mtx;
++};
++
++ThreadPoolConnSet threadpool_thds;
++
++int vio_cancel(Vio *vio, int how)
++{
++  int r= 0;
++  DBUG_ENTER("vio_cancel");
++
++  if (vio->inactive == false)
++  {
++    assert(vio->type == VIO_TYPE_TCPIP ||
++           vio->type == VIO_TYPE_SOCKET ||
++           vio->type == VIO_TYPE_SSL);
++
++    assert(mysql_socket_getfd(vio->mysql_socket) >= 0);
++    if (mysql_socket_shutdown(vio->mysql_socket, how))
++      r= -1;
++  }
++
++  DBUG_RETURN(r);
++}
++
++/**
++  Asynchronous network IO.
++
++  We use the native edge-triggered network IO multiplexing facility.
++  This maps to different APIs on different Unixes.
++
++  Currently supported are Linux with epoll, Solaris with event ports,
++  and OSX and BSD with kevent. All those APIs are used with one-shot flags
++  (the event is signalled once the client has written something into the
++  socket, then the socket is removed from the "poll-set" until the command
++  is finished, and we need to re-arm/re-register the socket).
++
++  No implementation for poll/select/AIO is currently provided.
++
++  The API closely resembles all of the above mentioned platform APIs
++  and consists of the following functions.
++
++  - io_poll_create()
++    Creates an io_poll descriptor.
++    On Linux: epoll_create()
++
++  - io_poll_associate_fd(int poll_fd, int fd, void *data)
++    Associate a file descriptor with the io poll descriptor.
++    On Linux: epoll_ctl(..EPOLL_CTL_ADD)
++
++  - io_poll_disassociate_fd(int pollfd, int fd)
++    Disassociate a file descriptor from the io poll descriptor.
++    On Linux: epoll_ctl(..EPOLL_CTL_DEL)
++
++  - io_poll_start_read(int poll_fd, int fd, void *data)
++    The same as io_poll_associate_fd(), but cannot be used before
++    io_poll_associate_fd() was called.
++    On Linux: epoll_ctl(..EPOLL_CTL_MOD)
++
++  - io_poll_wait (int pollfd, native_event *native_events, int maxevents,
++    int timeout_ms)
++
++    Wait until one or more descriptors added with io_poll_associate_fd()
++    or io_poll_start_read() becomes readable. Data associated with
++    descriptors can be retrieved from the native_events array, using
++    the native_event_get_userdata() function.
++ ++ ++ On Linux: epoll_wait() ++*/ ++ ++#if defined(__linux__) ++#ifndef EPOLLRDHUP ++/* Early 2.6 kernel did not have EPOLLRDHUP */ ++#define EPOLLRDHUP 0 ++#endif ++static int io_poll_create() noexcept { return epoll_create(1); } ++ ++static int io_poll_associate_fd(int pollfd, int fd, void *data) noexcept { ++ struct epoll_event ev; ++ ev.data.u64 = 0; /* Keep valgrind happy */ ++ ev.data.ptr = data; ++ ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT; ++ return epoll_ctl(pollfd, EPOLL_CTL_ADD, fd, &ev); ++} ++ ++static int io_poll_start_read(int pollfd, int fd, void *data) noexcept { ++ struct epoll_event ev; ++ ev.data.u64 = 0; /* Keep valgrind happy */ ++ ev.data.ptr = data; ++ ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT; ++ return epoll_ctl(pollfd, EPOLL_CTL_MOD, fd, &ev); ++} ++ ++static int io_poll_disassociate_fd(int pollfd, int fd) noexcept { ++ struct epoll_event ev; ++ return epoll_ctl(pollfd, EPOLL_CTL_DEL, fd, &ev); ++} ++ ++/* ++ Wrapper around epoll_wait. ++ NOTE - in case of EINTR, it restarts with original timeout. Since we use ++ either infinite or 0 timeouts, this is not critical ++*/ ++static int io_poll_wait(int pollfd, native_event *native_events, int maxevents, ++ int timeout_ms) noexcept { ++ int ret; ++ do { ++ ret = epoll_wait(pollfd, native_events, maxevents, timeout_ms); ++ } while (ret == -1 && errno == EINTR); ++ return ret; ++} ++ ++static void *native_event_get_userdata(native_event *event) noexcept { ++ return event->data.ptr; ++} ++ ++#elif defined(__FreeBSD__) || defined(__APPLE__) ++static int io_poll_create() noexcept { return kqueue(); } ++ ++static int io_poll_start_read(int pollfd, int fd, void *data) noexcept { ++ struct kevent ke; ++ EV_SET(&ke, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, data); ++ return kevent(pollfd, &ke, 1, 0, 0, 0); ++} ++ ++static int io_poll_associate_fd(int pollfd, int fd, void *data) noexcept { ++ struct kevent ke; ++ EV_SET(&ke, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, data); ++ return io_poll_start_read(pollfd, fd, data); ++} ++ ++static int io_poll_disassociate_fd(int pollfd, int fd) noexcept { ++ struct kevent ke; ++ EV_SET(&ke, fd, EVFILT_READ, EV_DELETE, 0, 0, nullptr); ++ return kevent(pollfd, &ke, 1, 0, 0, 0); ++} ++ ++static int io_poll_wait(int pollfd, struct kevent *events, int maxevents, ++ int timeout_ms) noexcept { ++ struct timespec ts; ++ int ret; ++ if (timeout_ms >= 0) { ++ ts.tv_sec = timeout_ms / 1000; ++ ts.tv_nsec = (timeout_ms % 1000) * 1000000; ++ } ++ do { ++ ret = kevent(pollfd, 0, 0, events, maxevents, ++ (timeout_ms >= 0) ? &ts : nullptr); ++ } while (ret == -1 && errno == EINTR); ++ return ret; ++} ++ ++static void *native_event_get_userdata(native_event *event) noexcept { ++ return event->udata; ++} ++#else ++#error not ported yet to this OS ++#endif ++ ++namespace { ++ ++/* ++ Prevent too many active threads executing at the same time, if the workload is ++ not CPU bound. ++*/ ++inline bool too_many_active_threads( ++ const thread_group_t &thread_group) noexcept { ++ return (thread_group.active_thread_count >= ++ 1 + (int)threadpool_oversubscribe && ++ !thread_group.stalled); ++} ++ ++/* ++ Limit the number of 'busy' threads by 1 + threadpool_toobusy. A thread ++ is busy if it is in either the active state or the waiting state (i.e. between ++ thd_wait_begin() / thd_wait_end() calls). 
++*/ ++inline bool too_many_busy_threads(const thread_group_t &thread_group) noexcept { ++ return (thread_group.active_thread_count + thread_group.waiting_thread_count > ++ 1 + (int)threadpool_toobusy); ++} ++ ++inline bool too_many_connection(const thread_group_t &thread_group) noexcept { ++ return (thread_group.connection_count > (int)threadpool_toobusy - 1); ++} ++ ++/* ++ Checks if a given connection is eligible to enter the high priority queue ++ based on its current thread_pool_high_prio_mode value, available high ++ priority tickets and transactional state and whether any locks are held. ++*/ ++inline bool connection_is_high_prio(const connection_t &c) noexcept { ++ const ulong mode = tp_get_thdvar_high_prio_mode(c.thd); ++ ++ return (mode == TP_HIGH_PRIO_MODE_STATEMENTS) || ++ (mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c.tickets > 0 && ++ (thd_is_transaction_active(c.thd) || ++ c.thd->variables.option_bits & OPTION_TABLE_LOCK || ++ c.thd->locked_tables_mode != LTM_NONE || ++ c.thd->mdl_context.has_locks() || ++ c.thd->global_read_lock.is_acquired() || ++ c.thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) || ++ c.thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE))); ++} ++ ++inline bool connection_is_worker_continue(const connection_t &c) noexcept { ++ if (c.thd->is_admin_connection()) { ++ return true; ++ } ++ ++ if (c.thread_group != &all_groups[c.thd->thread_id() % group_count]) { ++ return false; ++ } ++ ++ if (!too_many_connection(*(c.thread_group))) { ++ return true; ++ } ++ ++ const ulong mode = tp_get_thdvar_high_prio_mode(c.thd); ++ bool ret = (mode == TP_HIGH_PRIO_MODE_TRANSACTIONS && c.tickets > 0 && ++ (thd_is_transaction_active(c.thd) || ++ c.thd->variables.option_bits & OPTION_TABLE_LOCK || ++ c.thd->locked_tables_mode != LTM_NONE || ++ c.thd->mdl_context.has_locks() || ++ c.thd->global_read_lock.is_acquired() || ++ c.thd->mdl_context.has_locks(MDL_key::USER_LEVEL_LOCK) || ++ c.thd->mdl_context.has_locks(MDL_key::LOCKING_SERVICE))); ++ return ret; ++} ++ ++} // namespace ++ ++/* Dequeue element from a workqueue */ ++static connection_t *queue_get(thread_group_t *thread_group) noexcept { ++ DBUG_ENTER("queue_get"); ++ thread_group->queue_event_count++; ++ connection_t *c; ++ ++ if ((c = thread_group->high_prio_queue.front())) { ++ thread_group->high_prio_queue.remove(c); ++ } ++ /* ++ Don't pick events from the low priority queue if there are too many ++ active + waiting threads. 
++   */
++  else if (!too_many_busy_threads(*thread_group) &&
++           (c = thread_group->queue.front())) {
++    thread_group->queue.remove(c);
++  }
++  DBUG_RETURN(c);
++}
++
++static connection_t *queue_get(thread_group_t *group, operation_origin origin) {
++  connection_t *ret = queue_get(group);
++  if (ret != nullptr) {
++    TP_INCREMENT_GROUP_COUNTER(group, dequeues[(int)origin]);
++  }
++  return ret;
++}
++
++static inline void queue_push(thread_group_t *thread_group, connection_t *connection)
++{
++  connection->enqueue_time = pool_timer.current_microtime;
++  thread_group->queue.push_back(connection);
++}
++
++static inline void high_prio_queue_push(thread_group_t *thread_group, connection_t *connection)
++{
++  connection->enqueue_time = pool_timer.current_microtime;
++  thread_group->high_prio_queue.push_back(connection);
++}
++
++class Thd_timeout_checker : public Do_THD_Impl {
++ private:
++  pool_timer_t *const m_timer;
++
++ public:
++  Thd_timeout_checker(pool_timer_t *timer) noexcept : m_timer(timer) {}
++
++  virtual ~Thd_timeout_checker() {}
++
++  virtual void operator()(THD *thd) noexcept {
++    if (thd_get_net_read_write(thd) != 1) return;
++
++    connection_t *connection = (connection_t *)thd->scheduler.data;
++    if (!connection) return;
++
++    if (connection->abs_wait_timeout <
++        m_timer->current_microtime.load(std::memory_order_relaxed)) {
++      /* Wait timeout exceeded, kill connection. */
++      mysql_mutex_lock(&thd->LOCK_thd_data);
++      thd->killed = THD::KILL_CONNECTION;
++      tp_post_kill_notification(thd);
++      mysql_mutex_unlock(&thd->LOCK_thd_data);
++    } else {
++      set_next_timeout_check(connection->abs_wait_timeout);
++    }
++  }
++};
++
++/*
++  Handle wait timeout:
++  Find connections that have been idle for too long and kill them.
++  Also, recalculate the time when the next timeout check should run.
++*/
++static void timeout_check(pool_timer_t *timer) {
++  DBUG_ENTER("timeout_check");
++
++  /* Reset next timeout check, it will be recalculated in the loop below */
++  timer->next_timeout_check.store(ULLONG_MAX, std::memory_order_relaxed);
++
++  Thd_timeout_checker thd_timeout_checker(timer);
++  Global_THD_manager::get_instance()->do_for_all_thd_copy(&thd_timeout_checker);
++
++  DBUG_VOID_RETURN;
++}
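A minimal model of the deadline bookkeeping above (an assumption: the
set_next_timeout_check() body, which lies outside this hunk, lowers the shared
deadline with a CAS loop; the sketch is illustrative only):

  #include <atomic>
  #include <climits>
  #include <cstdio>

  static std::atomic<unsigned long long> next_timeout_check{ULLONG_MAX};

  static void set_next_timeout_check_model(unsigned long long abstime) {
    unsigned long long old = next_timeout_check.load(std::memory_order_relaxed);
    // Keep the minimum of all reported per-connection deadlines.
    while (abstime < old &&
           !next_timeout_check.compare_exchange_weak(old, abstime)) {
    }
  }

  int main() {
    next_timeout_check.store(ULLONG_MAX);  // the timeout_check() reset
    unsigned long long deadlines[] = {900, 300, 750};  // abs_wait_timeout values
    for (auto d : deadlines) set_next_timeout_check_model(d);
    std::printf("next check at %llu\n", next_timeout_check.load());  // 300
    return 0;
  }

Resetting to "infinity" and re-deriving the minimum on every sweep means stale
deadlines from closed connections disappear without explicit cleanup.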
++/*
++  Timer thread.
++
++  Periodically check if one of the thread groups is stalled. Stalls happen if
++  events are not being dequeued from the queue, or from the network. The
++  primary reason for a stall can be a lengthy executing non-blocking request.
++  It could also happen that a thread is waiting but wait_begin/wait_end was
++  forgotten by the storage engine. The timer thread will create a new thread
++  in the group in case of a stall.
++
++  Besides checking for stalls, the timer thread is also responsible for
++  terminating clients that have been idle for longer than wait_timeout
++  seconds.
++
++  TODO: Let the timer sleep for a long time if there is no work to be done.
++  Currently it wakes up rather often on an idle server.
++*/
++static void *timer_thread(void *param) noexcept {
++  my_thread_init();
++  DBUG_ENTER("timer_thread");
++
++  pool_timer_t *timer = (pool_timer_t *)param;
++  timer->next_timeout_check.store(ULLONG_MAX, std::memory_order_relaxed);
++  timer->current_microtime.store(my_microsecond_getsystime(),
++                                 std::memory_order_relaxed);
++
++  for (;;) {
++    struct timespec ts;
++
++    set_timespec_nsec(&ts, timer->tick_interval * 1000000ULL);
++    mysql_mutex_lock(&timer->mutex);
++    int err = mysql_cond_timedwait(&timer->cond, &timer->mutex, &ts);
++    if (timer->shutdown) {
++      mysql_mutex_unlock(&timer->mutex);
++      break;
++    }
++    if (err == ETIMEDOUT) {
++      timer->current_microtime.store(my_microsecond_getsystime(),
++                                     std::memory_order_relaxed);
++
++      /* Check stalls in thread groups */
++      for (size_t i = 0; i < array_elements(all_groups); i++) {
++        if (all_groups[i].connection_count) check_stall(&all_groups[i]);
++      }
++
++      /* Check if any client exceeded wait_timeout */
++      if (timer->next_timeout_check.load(std::memory_order_relaxed) <=
++          timer->current_microtime.load(std::memory_order_relaxed))
++        timeout_check(timer);
++    }
++    mysql_mutex_unlock(&timer->mutex);
++  }
++
++  mysql_mutex_destroy(&timer->mutex);
++  my_thread_end();
++  return nullptr;
++}
++
++/*
++  Check if both the high and low priority queues are empty.
++
++  NOTE: we also consider the low priority queue empty in case it has events,
++  but they cannot be processed due to the too_many_busy_threads() limit.
++*/
++static bool queues_are_empty(const thread_group_t &tg) noexcept {
++  return (tg.high_prio_queue.is_empty() &&
++          (tg.queue.is_empty() || too_many_busy_threads(tg)));
++}
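A standalone model of the stall rule implemented in check_stall() below
(illustrative only): a group counts as stalled when no events were dequeued since
the last tick while work is still queued.

  #include <cstdio>

  struct GroupModel {
    int queue_event_count = 0;  // dequeues since the last tick
    int queued = 0;             // events waiting in the queues
    bool stalled = false;
  };

  // One timer tick: detect a stall, then reset the counter for the next tick.
  static bool tick(GroupModel &g) {
    g.stalled = (g.queue_event_count == 0 && g.queued > 0);
    g.queue_event_count = 0;
    return g.stalled;  // the caller would wake or create a worker on true
  }

  int main() {
    GroupModel g;
    g.queued = 5;              // work is pending
    g.queue_event_count = 0;   // but nothing was dequeued: stall
    std::printf("stalled=%d\n", tick(g));  // 1
    g.queue_event_count = 3;   // workers made progress: no stall
    std::printf("stalled=%d\n", tick(g));  // 0
    return 0;
  }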
++static void check_stall(thread_group_t *thread_group) {
++  if (mysql_mutex_trylock(&thread_group->mutex) != 0) {
++    /* Something is happening in the group; don't disturb */
++    return;
++  }
++
++  /*
++    Check if a listener is present. If not, check whether any IO
++    events were dequeued since last time. If not, this means the
++    listener is either in a tight loop or thd_wait_begin()
++    was forgotten. Create a new worker (it will make itself the listener).
++  */
++  if (!thread_group->listener && !thread_group->io_event_count) {
++    wake_or_create_thread(thread_group, true);
++    mysql_mutex_unlock(&thread_group->mutex);
++    return;
++  }
++
++  /* Reset io event count */
++  thread_group->io_event_count = 0;
++
++  /*
++    Check whether requests from the workqueues are being dequeued.
++
++    The stall detection and resolution works as follows:
++
++    1. There is a counter thread_group->queue_event_count for the number of
++       events removed from the queues. The timer resets the counter to 0 on
++       each run.
++    2. The timer determines a stall if this counter remains 0 since the last
++       check and at least one of the high and low priority queues is not
++       empty.
++    3. Once the timer has determined a stall, it sets the
++       thread_group->stalled flag and wakes an idle worker (or creates a new
++       one, subject to throttling).
++    4. The stalled flag is reset when an event is dequeued.
++
++    Q: Will this handling lead to an unbounded growth of threads, if the
++       queues stall permanently?
++    A: No. If queues stall permanently, it is an indication of many very long
++       simultaneous queries. The maximum number of simultaneous queries is
++       max_connections, and further we have the threadpool_max_threads limit,
++       upon which no more worker threads are created. So in case there is a
++       flood of very long queries, the threadpool would slowly approach
++       thread-per-connection behavior.
++    NOTE:
++    If long queries never wait, creation of the new threads is done by the
++    timer, so it is slower than in real thread-per-connection. However if
++    long queries do wait and indicate that via thd_wait_begin/end callbacks,
++    thread creation will be faster.
++  */
++  if (!thread_group->queue_event_count && !queues_are_empty(*thread_group)) {
++    thread_group->stalled = true;
++    TP_INCREMENT_GROUP_COUNTER(thread_group, stalls);
++    wake_or_create_thread(thread_group, true);
++  }
++
++  /* Reset queue event count */
++  thread_group->queue_event_count = 0;
++
++  mysql_mutex_unlock(&thread_group->mutex);
++}
++
++static void start_timer(pool_timer_t *timer) noexcept {
++  my_thread_handle thread_id;
++  DBUG_ENTER("start_timer");
++  mysql_mutex_init(key_timer_mutex, &timer->mutex, nullptr);
++  mysql_cond_init(key_timer_cond, &timer->cond);
++  timer->shutdown = false;
++  mysql_thread_create(key_timer_thread, &thread_id, nullptr, timer_thread, timer);
++  DBUG_VOID_RETURN;
++}
++
++static void stop_timer(pool_timer_t *timer) noexcept {
++  DBUG_ENTER("stop_timer");
++  mysql_mutex_lock(&timer->mutex);
++  timer->shutdown = true;
++  mysql_cond_signal(&timer->cond);
++  mysql_mutex_unlock(&timer->mutex);
++  DBUG_VOID_RETURN;
++}
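The listener below depends on the one-shot registration described in the
io_poll comment earlier. A standalone sketch of that cycle (Linux-only,
simplified error handling; a pipe stands in for a client socket):

  #include <sys/epoll.h>
  #include <unistd.h>
  #include <cstdio>

  int main() {
    int pollfd = epoll_create(1);
    int pipefd[2];
    if (pollfd < 0 || pipe(pipefd) != 0) return 1;

    epoll_event ev{};
    ev.events = EPOLLIN | EPOLLET | EPOLLERR | EPOLLRDHUP | EPOLLONESHOT;
    ev.data.fd = pipefd[0];
    epoll_ctl(pollfd, EPOLL_CTL_ADD, pipefd[0], &ev);  // io_poll_associate_fd()

    char c = 'x';
    (void)!write(pipefd[1], &c, 1);                    // client sends a command
    epoll_event out[1];
    int n = epoll_wait(pollfd, out, 1, 100);           // listener sees it once
    std::printf("first wait: %d event(s)\n", n);       // 1

    n = epoll_wait(pollfd, out, 1, 100);               // muted: one-shot fired
    std::printf("second wait: %d event(s)\n", n);      // 0

    epoll_ctl(pollfd, EPOLL_CTL_MOD, pipefd[0], &ev);  // io_poll_start_read()
    n = epoll_wait(pollfd, out, 1, 100);               // re-armed, fires again
    std::printf("after re-arm: %d event(s)\n", n);     // 1
    close(pipefd[0]); close(pipefd[1]); close(pollfd);
    return 0;
  }

While the fd is muted, exactly one worker owns the connection; re-arming happens
only after the command batch is finished, which is what makes the edge-triggered
mode safe here.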
++/**
++  Poll for socket events and distribute them to worker threads.
++  In many cases the current thread will handle a single event itself.
++
++  @return a ready connection, or NULL on shutdown
++*/
++static connection_t *listener(thread_group_t *thread_group) {
++  DBUG_ENTER("listener");
++  connection_t *retval = nullptr;
++
++  for (;;) {
++    if (thread_group->shutdown) break;
++
++    native_event ev[MAX_EVENTS];
++    int cnt = io_poll_wait(thread_group->pollfd, ev, MAX_EVENTS, -1);
++
++    DBUG_EXECUTE_IF("threadpool_io_poll_wait_at_least_2_events",
++      {
++        while (cnt < 2)
++        {
++          int cnt_again = io_poll_wait(thread_group->pollfd, ev + cnt, MAX_EVENTS - cnt, -1);
++          cnt += cnt_again;
++        }
++      }
++    );
++
++    TP_INCREMENT_GROUP_COUNTER(thread_group, polls[LISTENER]);
++    if (cnt <= 0) {
++      assert(thread_group->shutdown);
++      break;
++    }
++
++    mysql_mutex_lock(&thread_group->mutex);
++
++    if (thread_group->shutdown) {
++      mysql_mutex_unlock(&thread_group->mutex);
++      break;
++    }
++
++    thread_group->io_event_count += cnt;
++
++    /*
++      We got some network events and need to make decisions: whether the
++      listener should handle events itself, and whether to wake any worker
++      threads so they can handle events.
++
++      Q1: Should the listener handle an event itself, or put all events into
++      the queue and let workers handle the events?
++
++      Solution:
++      Generally, a listener that handles events itself is preferable. We do
++      not want the listener thread to change its state from waiting to
++      running too often; since the listener has just woken from poll, it
++      better uses its time slice and does some work. Besides, not handling
++      events means they go to the queue, and often another worker must wake
++      up to handle the event. This is not good, as we want to avoid wakeups.
++
++      The downside of a listener that also handles queries is that we can
++      potentially leave the thread group for a long time without picking up
++      new network events. It is not a major problem, because this stall will
++      be detected sooner or later by the timer thread. Still, relying on the
++      timer is not always good, because it may "tick" too slowly (a large
++      timer_interval).
++
++      We use the following strategy to solve this problem: if the queue was
++      not empty, we suspect a flood of network events and the listener stays.
++      Otherwise, it handles a query.
++
++      Q2: If the queue is not empty, how many workers to wake?
++
++      Solution:
++      We generally try to keep one thread per group active (threads handling
++      queries are considered active, unless they are stuck inside some
++      "wait"). Thus, we will wake only one worker, and only if there are no
++      active threads currently, and the listener is not going to handle a
++      query. When we don't wake, we hope that currently active threads will
++      finish fast and handle the queue. If this does not happen, the timer
++      thread will detect a stall and wake a worker.
++
++      NOTE: Currently nothing is done to detect or prevent long queuing
++      times. A solution for the future would be to give up the "one active
++      thread per group" principle if events stay in the queue for too long,
++      and just wake more workers.
++    */
++
++    const bool listener_picks_event = threadpool_dedicated_listener ? false :
++      (thread_group->high_prio_queue.is_empty() && thread_group->queue.is_empty());
++
++    /*
++      If listener_picks_event is set, the listener thread will handle the
++      first event, and put the rest into the queue. If listener_picks_event
++      is not set, all events go to the queue.
++    */
++    for (int i = (listener_picks_event) ? 1 : 0; i < cnt; i++) {
++      connection_t *c = (connection_t *)native_event_get_userdata(&ev[i]);
++      if (connection_is_high_prio(*c)) {
++        c->tickets--;
++        thread_group->high_prio_queue.push_back(c);
++      } else {
++        c->tickets = tp_get_thdvar_high_prio_tickets(c->thd);
++        queue_push(thread_group, c);
++      }
++    }
++
++    if (listener_picks_event) {
++      /* Handle the first event. */
++      retval = (connection_t *)native_event_get_userdata(&ev[0]);
++      TP_INCREMENT_GROUP_COUNTER(thread_group, dequeues[LISTENER]);
++      mysql_mutex_unlock(&thread_group->mutex);
++      break;
++    }
++
++    /* The maximum number of threads that may still be created */
++    int workers_in_need = (int)threadpool_toobusy -
++      thread_group->active_thread_count - thread_group->waiting_thread_count;
++
++    /* There are no remaining threads and the thread group is stalled */
++    if (workers_in_need <= 0 && thread_group->active_thread_count == 0) {
++      workers_in_need = 1;
++    }
++
++    /* Take the smaller of the number of threads that can be created and
++       the number of threads that are really needed */
++    workers_in_need = workers_in_need > cnt ? cnt : workers_in_need;
++
++    /* Wake up or create the required threads */
++    for (int i = 0; i < workers_in_need; i++) {
++      /* We added some work items to the queue, now wake a worker. */
++      if (wake_thread(thread_group, false)) {
++        /*
++          Wake failed, hence the group has no idle threads. Now check if
++          there are any threads in the group except the listener.
++          In order to achieve the best running performance for this number
++          of threads, the conditions for waking or creating worker threads
++          are relaxed: the queue is not empty and the listener is not going
++          to handle events, so to drain the queue we create a worker here.
++          Alternatively, we could just rely on the timer to detect a stall
++          and create a thread, but waiting for the timer would be an
++          inefficient and pointless delay.
++        */
++        create_worker(thread_group, false);
++      }
++    }
++    mysql_mutex_unlock(&thread_group->mutex);
++  }
++  DBUG_RETURN(retval);
++}
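A worked example of the workers_in_need computation in the listener above,
standalone and illustrative, with threadpool_toobusy = 13 (its default):

  #include <algorithm>
  #include <cstdio>

  static int workers_in_need(int toobusy, int active, int waiting, int cnt) {
    int n = toobusy - active - waiting;  // remaining "busy thread" budget
    if (n <= 0 && active == 0) n = 1;    // stalled group: force one wakeup
    return std::min(n, cnt);             // never wake more than ready events
  }

  int main() {
    //                          toobusy active waiting cnt
    std::printf("%d\n", workers_in_need(13,  2,      3,  20));  // 8
    std::printf("%d\n", workers_in_need(13,  0,     15,   4));  // 1 (stalled)
    return 0;
  }

So the group wakes at most as many workers as there are newly queued events, and
never beyond the busy-thread budget, except for the single forced wakeup when the
group would otherwise make no progress at all.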
++/**
++  Adjust thread counters in the group or globally
++  whenever a thread is created or is about to exit.
++
++  @param thread_group
++  @param count - 1, when a new thread is created
++                -1, when a thread is about to exit
++*/
++static void add_thread_count(thread_group_t *thread_group,
++                             int32 count) noexcept {
++  thread_group->thread_count += count;
++  /* a worker starts out and ends in the "active" state */
++  thread_group->active_thread_count += count;
++  tp_stats.num_worker_threads.fetch_add(count, std::memory_order_relaxed);
++}
++
++/**
++  Creates a new worker thread.
++  thread_mutex must be held when calling this function.
++
++  NOTE: in rare cases, the number of threads can exceed
++  threadpool_max_threads, because we need at least 2 threads
++  per group to prevent deadlocks (one listener + one worker)
++*/
++static int create_worker(thread_group_t *thread_group,
++                         bool due_to_stall) noexcept {
++  my_thread_handle thread_id;
++  bool max_threads_reached = false;
++  int err;
++
++  DBUG_ENTER("create_worker");
++  if (tp_stats.num_worker_threads.load(std::memory_order_relaxed) >=
++          (int)threadpool_max_threads &&
++      thread_group->thread_count >= 2) {
++    err = 1;
++    max_threads_reached = true;
++    goto end;
++  }
++
++  err = mysql_thread_create(key_worker_thread, &thread_id,
++                            thread_group->pthread_attr, worker_main,
++                            thread_group);
++  if (!err) {
++    thread_group->last_thread_creation_time = my_microsecond_getsystime();
++    Global_THD_manager::get_instance()->inc_thread_created();
++    add_thread_count(thread_group, 1);
++    TP_INCREMENT_GROUP_COUNTER(thread_group, thread_creations);
++
++    if (due_to_stall) {
++      TP_INCREMENT_GROUP_COUNTER(thread_group, thread_creations_due_to_stall);
++    }
++  } else {
++    set_my_errno(errno);
++  }
++
++end:
++  if (err) {
++    print_pool_blocked_message(max_threads_reached);
++  } else {
++    pool_block_start = 0; /* Reset pool blocked timer, if it was set */
++  }
++
++  DBUG_RETURN(err);
++}
++
++/**
++  Calculate the microsecond throttling delay for thread creation.
++
++  The value depends on how many threads are already in the group:
++  a small number of threads means no delay; the more threads, the larger
++  the delay.
++
++  The actual values were not calculated using any scientific methods.
++  They just look right, and behave well in practice.
++
++  TODO: Should throttling depend on thread_pool_stall_limit?
++*/
++static ulonglong microsecond_throttling_interval(
++    const thread_group_t &thread_group) noexcept {
++  const int count = thread_group.thread_count;
++
++  if (count < 4) return 0;
++
++  if (count < 8) return 50 * 1000;
++
++  if (count < 16) return 100 * 1000;
++
++  return 200 * 1000;
++}
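A standalone worked example of the throttling policy above: creation is allowed
only when the time since the last creation exceeds the size-dependent interval
(values in microseconds; illustrative only):

  #include <cstdio>

  static unsigned long long interval_usec(int thread_count) {
    if (thread_count < 4) return 0;
    if (thread_count < 8) return 50 * 1000;
    if (thread_count < 16) return 100 * 1000;
    return 200 * 1000;
  }

  int main() {
    // 10 threads in the group: the interval is 100 ms.
    unsigned long long last_creation = 1'000'000;  // us
    unsigned long long now = 1'060'000;            // 60 ms later
    bool ok = (now - last_creation) > interval_usec(10);
    std::printf("60 ms after last creation: %s\n", ok ? "create" : "throttle");
    now = 1'150'000;                               // 150 ms later
    ok = (now - last_creation) > interval_usec(10);
    std::printf("150 ms after last creation: %s\n", ok ? "create" : "throttle");
    return 0;
  }

The step function means small groups grow instantly, while a group that already
has many threads can add at most a handful per second.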
++  */
++  if (thread_group->active_thread_count <
++      (1 + (int)threadpool_oversubscribe)) {
++    /*
++      We're better off creating a new thread here with no delay: either
++      there are not enough active workers, or they are all blocked and
++      there was no idle thread to wake up. Smells like a potential deadlock
++      or very slowly executing requests, e.g. sleeps or user locks.
++    */
++    DBUG_RETURN(create_worker(thread_group, due_to_stall));
++  }
++
++  const ulonglong now = my_microsecond_getsystime();
++  const ulonglong time_since_last_thread_created =
++      (now - thread_group->last_thread_creation_time);
++
++  /* Throttle thread creation. */
++  if (time_since_last_thread_created >
++      microsecond_throttling_interval(*thread_group)) {
++    DBUG_RETURN(create_worker(thread_group, due_to_stall));
++  }
++
++  TP_INCREMENT_GROUP_COUNTER(thread_group, throttles);
++  DBUG_RETURN(-1);
++}
++
++static int thread_group_init(thread_group_t *thread_group,
++                             pthread_attr_t *thread_attr) noexcept {
++  DBUG_ENTER("thread_group_init");
++  thread_group->pthread_attr = thread_attr;
++  mysql_mutex_init(key_group_mutex, &thread_group->mutex, nullptr);
++  thread_group->pollfd = -1;
++  thread_group->shutdown_pipe[0] = -1;
++  thread_group->shutdown_pipe[1] = -1;
++  thread_group->thread_count = 0;
++  thread_group->admin_port_thread_count = 0;
++  thread_group->dump_thread_count = 0;
++  thread_group->active_thread_count = 0;
++  thread_group->connection_count = 0;
++  thread_group->waiting_thread_count = 0;
++  thread_group->io_event_count = 0;
++  thread_group->queue_event_count = 0;
++  thread_group->shutdown = false;
++  thread_group->stalled = false;
++  DBUG_RETURN(0);
++}
++
++static void thread_group_destroy(thread_group_t *thread_group) noexcept {
++  mysql_mutex_destroy(&thread_group->mutex);
++  if (thread_group->pollfd != -1) {
++    close(thread_group->pollfd);
++    thread_group->pollfd = -1;
++  }
++  for (int i = 0; i < 2; i++) {
++    if (thread_group->shutdown_pipe[i] != -1) {
++      close(thread_group->shutdown_pipe[i]);
++      thread_group->shutdown_pipe[i] = -1;
++    }
++  }
++}
++
++/**
++  Wake a sleeping thread from the waiting list
++*/
++static int wake_thread(thread_group_t *thread_group, bool due_to_stall) noexcept {
++  DBUG_ENTER("wake_thread");
++  worker_thread_t *thread = thread_group->waiting_threads.front();
++  if (thread) {
++    thread->woken = true;
++    thread_group->waiting_threads.remove(thread);
++    mysql_cond_signal(&thread->cond);
++    TP_INCREMENT_GROUP_COUNTER(thread_group, wakes);
++    if (due_to_stall) {
++      TP_INCREMENT_GROUP_COUNTER(thread_group, wakes_due_to_stall);
++    }
++    DBUG_RETURN(0);
++  }
++  DBUG_RETURN(1); /* no thread in waiter list => missed wakeup */
++}
++
++/**
++  Shutdown for thread group
++*/
++static void thread_group_close(thread_group_t *thread_group) noexcept {
++  DBUG_ENTER("thread_group_close");
++
++  mysql_mutex_lock(&thread_group->mutex);
++  if (thread_group->thread_count == 0) {
++    mysql_mutex_unlock(&thread_group->mutex);
++    thread_group_destroy(thread_group);
++    DBUG_VOID_RETURN;
++  }
++
++  thread_group->shutdown = true;
++  thread_group->listener = nullptr;
++
++  if (pipe(thread_group->shutdown_pipe)) {
++    mysql_mutex_unlock(&thread_group->mutex);
++    DBUG_VOID_RETURN;
++  }
++
++  /* Wake listener */
++  if (io_poll_associate_fd(thread_group->pollfd,
++                           thread_group->shutdown_pipe[0], nullptr)) {
++    mysql_mutex_unlock(&thread_group->mutex);
++    DBUG_VOID_RETURN;
++  }
++  char c = 0;
++  if (write(thread_group->shutdown_pipe[1], &c, 1) < 0) {
++    mysql_mutex_unlock(&thread_group->mutex);
++    DBUG_VOID_RETURN;
++  }
++
++  /* Wake all workers. */
++  while (wake_thread(thread_group, false) == 0) {
++  }
++
++  mysql_mutex_unlock(&thread_group->mutex);
++  DBUG_VOID_RETURN;
++}
++
++/*
++  Add work to the queue. Maybe wake a worker if they all sleep.
++
++  Currently, this function is only used when new connections need to
++  perform login (this is done in worker threads).
++*/
++static void queue_put(thread_group_t *thread_group, connection_t *connection) {
++  DBUG_ENTER("queue_put");
++
++  mysql_mutex_lock(&thread_group->mutex);
++  connection->tickets = tp_get_thdvar_high_prio_tickets(connection->thd);
++  connection->enqueue_time = pool_timer.current_microtime;
++
++  queue_push(thread_group, connection);
++
++  /* To achieve the best performance from the number of threads, the
++     conditions for waking or creating worker threads are relaxed. */
++  if (thread_group->active_thread_count <
++      1 + (int)threadpool_oversubscribe) {
++    wake_or_create_thread(thread_group, false);
++  }
++
++  mysql_mutex_unlock(&thread_group->mutex);
++
++  DBUG_VOID_RETURN;
++}
++
++/**
++  Retrieve a connection with a pending event.
++
++  A pending event in our case means that there is either a pending login
++  request (if the connection is not yet logged in), or there are unread
++  bytes on the socket.
++
++  If there are no pending events currently, the thread will wait.
++  If the timeout specified in the abstime parameter passes, the function
++  returns nullptr.
++
++  @param current_thread - current worker thread
++  @param thread_group - current thread group
++  @param abstime - absolute wait timeout
++
++  @return
++    connection with pending event.
++    nullptr is returned if the timeout has expired, or on shutdown.
++*/
++static connection_t *get_event(worker_thread_t *current_thread,
++                               thread_group_t *thread_group,
++                               struct timespec *abstime) {
++  DBUG_ENTER("get_event");
++  connection_t *connection = nullptr;
++  int err = 0;
++
++  mysql_mutex_lock(&thread_group->mutex);
++  assert(thread_group->active_thread_count >= 0);
++
++  for (;;) {
++    const bool oversubscribed = too_many_active_threads(*thread_group);
++    if (thread_group->shutdown) break;
++
++    /* Check if queue is not empty */
++    if (!oversubscribed) {
++      connection = queue_get(thread_group, WORKER);
++      if (connection) break;
++    }
++
++    /* If there is currently no listener in the group, become one. */
++    if (!thread_group->listener) {
++      thread_group->listener = current_thread;
++      thread_group->active_thread_count--;
++      mysql_mutex_unlock(&thread_group->mutex);
++
++      connection = listener(thread_group);
++
++      mysql_mutex_lock(&thread_group->mutex);
++      thread_group->active_thread_count++;
++      /* There is no listener anymore, it just returned. */
++      thread_group->listener = nullptr;
++      break;
++    }
++
++    /*
++      Last thing we try before going to sleep is to
++      pick a single event via epoll, without waiting (timeout 0)
++    */
++    if (!oversubscribed) {
++      native_event nev;
++      if (io_poll_wait(thread_group->pollfd, &nev, 1, 0) == 1) {
++        thread_group->io_event_count++;
++        TP_INCREMENT_GROUP_COUNTER(thread_group, polls[WORKER]);
++        connection = (connection_t *)native_event_get_userdata(&nev);
++
++        /*
++          Since we are going to perform out-of-order event processing for
++          the connection, first check whether it is eligible for high
++          priority processing. We can get here even if there are queued
++          events, so it must either have a high priority ticket, or there
++          must not be too many busy threads (as if it was coming from a
++          low priority queue).
++
++        */
++        if (connection_is_high_prio(*connection))
++          connection->tickets--;
++        else if (too_many_busy_threads(*thread_group)) {
++          /*
++            Not eligible for high priority processing. Restore tickets and
++            put it into the low priority queue.
++          */
++          connection->tickets = tp_get_thdvar_high_prio_tickets(connection->thd);
++          thread_group->queue.push_back(connection);
++          connection = nullptr;
++        }
++
++        if (connection) {
++          TP_INCREMENT_GROUP_COUNTER(thread_group, dequeues[WORKER]);
++          thread_group->queue_event_count++;
++          break;
++        }
++      }
++    }
++
++    /* And now, finally sleep */
++    current_thread->woken = false; /* wake() sets this to true */
++
++    /*
++      Add the current thread to the head of the waiting list and wait.
++      It is important to add the thread to the head rather than the tail,
++      as it ensures LIFO wakeup order (hot caches, correct idle-timeout
++      accounting)
++    */
++    thread_group->waiting_threads.push_front(current_thread);
++
++    thread_group->active_thread_count--;
++    if (abstime) {
++      err = mysql_cond_timedwait(&current_thread->cond, &thread_group->mutex,
++                                 abstime);
++    } else {
++      err = mysql_cond_wait(&current_thread->cond, &thread_group->mutex);
++    }
++    thread_group->active_thread_count++;
++
++    if (!current_thread->woken) {
++      /*
++        Thread was not signalled by wake(); it might be a spurious wakeup
++        or a timeout. Anyhow, we need to remove ourselves from the list
++        now. If the thread was explicitly woken, then the caller removed
++        us from the list.
++      */
++      thread_group->waiting_threads.remove(current_thread);
++    }
++
++    if (err) break;
++  }
++
++  thread_group->stalled = false;
++  mysql_mutex_unlock(&thread_group->mutex);
++
++  DBUG_RETURN(connection);
++}
++
++/**
++  Tells the pool that a worker starts waiting on IO, lock, condition,
++  sleep() or similar.
++*/
++
++static void wait_begin(thread_group_t *thread_group) noexcept {
++  DBUG_ENTER("wait_begin");
++  mysql_mutex_lock(&thread_group->mutex);
++  thread_group->active_thread_count--;
++  thread_group->waiting_thread_count++;
++
++  assert(thread_group->active_thread_count >= 0);
++  assert(thread_group->connection_count > 0);
++
++#ifdef THREADPOOL_CREATE_THREADS_ON_WAIT
++  /* To achieve the best performance from the number of threads, the
++     conditions for waking or creating worker threads are relaxed. */
++  if ((thread_group->active_thread_count < (1 + (int)threadpool_oversubscribe)) &&
++      (!queues_are_empty(*thread_group) || !thread_group->listener)) {
++    /*
++      Group might stall while this thread waits, thus wake
++      or create a worker to prevent the stall.
++    */
++    wake_or_create_thread(thread_group, false);
++  }
++#endif
++
++  mysql_mutex_unlock(&thread_group->mutex);
++  DBUG_VOID_RETURN;
++}
++
++/**
++  Tells the pool that a worker has finished waiting.
++*/
++static void wait_end(thread_group_t *thread_group) noexcept {
++  DBUG_ENTER("wait_end");
++  mysql_mutex_lock(&thread_group->mutex);
++  thread_group->active_thread_count++;
++  thread_group->waiting_thread_count--;
++  mysql_mutex_unlock(&thread_group->mutex);
++  DBUG_VOID_RETURN;
++}
++
++/**
++  Allocate/initialize a new connection structure.
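++  The structure is allocated with my_malloc() here and released with
++  my_free() in connection_abort() or connection_detach().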
++*/
++
++static connection_t *alloc_connection(THD *thd) noexcept {
++  DBUG_ENTER("alloc_connection");
++  DBUG_EXECUTE_IF("simulate_tp_alloc_connection_oom", DBUG_RETURN(nullptr););
++
++  connection_t *connection = (connection_t *)my_malloc(
++      PSI_NOT_INSTRUMENTED /*key_memory_thread_pool_connection*/,
++      sizeof(connection_t), 0);
++  if (connection) {
++    connection->thd = thd;
++    connection->waiting = false;
++    connection->logged_in = false;
++    connection->bound_to_poll_descriptor = false;
++    connection->abs_wait_timeout = ULLONG_MAX;
++    connection->tickets = 0;
++  }
++  DBUG_RETURN(connection);
++}
++
++/**
++  Add a new connection to the thread pool.
++*/
++
++bool tp_add_connection(Channel_info *channel_info) {
++  DBUG_ENTER("Thread_pool_connection_handler::add_connection");
++
++  THD *const thd = channel_info->create_thd();
++
++  if (unlikely(!thd)) {
++    channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false);
++    DBUG_RETURN(true);
++  }
++
++  connection_t *const connection = alloc_connection(thd);
++
++  if (unlikely(!connection)) {
++    thd->get_protocol_classic()->end_net();
++    delete thd;
++    // channel will be closed by send_error_and_close_channel()
++    channel_info->send_error_and_close_channel(ER_OUT_OF_RESOURCES, 0, false);
++    DBUG_RETURN(true);
++  }
++
++  delete channel_info;
++
++  thd->set_new_thread_id();
++  thd->start_utime = my_micro_time();
++
++  threadpool_thds.insert(connection);
++  Global_THD_manager::get_instance()->add_thd(thd);
++
++  thd->scheduler.data = connection;
++
++  /* Assign connection to a group. */
++  thread_group_t *group = &all_groups[thd->thread_id() % group_count];
++
++  connection->thread_group = group;
++
++  if (thd->is_admin_connection()) {
++    my_thread_handle thread_id;
++    mysql_mutex_lock(&group->mutex);
++    int err = mysql_thread_create(key_worker_thread, &thread_id,
++                                  group->pthread_attr,
++                                  admin_port_worker_main, connection);
++
++    if (err) {
++      set_my_errno(errno);
++      print_pool_blocked_message(false);
++    } else {
++      group->admin_port_thread_count++;
++    }
++    mysql_mutex_unlock(&group->mutex);
++  } else {
++    mysql_mutex_lock(&group->mutex);
++    group->connection_count++;
++    mysql_mutex_unlock(&group->mutex);
++
++    /*
++      Add the connection to the work queue. The actual login
++      will be done by a worker thread.
++    */
++    queue_put(group, connection);
++  }
++
++  DBUG_RETURN(false);
++}
++
++/**
++  Terminate connection.
++*/
++static void connection_abort(connection_t *connection) {
++  DBUG_ENTER("connection_abort");
++  threadpool_thds.erase(connection);
++
++  thread_group_t *group = connection->thread_group;
++  bool is_admin_port = connection->thd->is_admin_connection();
++  threadpool_remove_connection(connection->thd);
++
++  if (!is_admin_port) {
++    mysql_mutex_lock(&group->mutex);
++    group->connection_count--;
++    mysql_mutex_unlock(&group->mutex);
++  }
++
++  my_free(connection);
++  DBUG_VOID_RETURN;
++}
++
++/**
++  Detach connection.
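++  Detaching takes the connection off the group's poll descriptor and
++  connection count, then hands the THD over to a dedicated
++  connection_detach_worker thread, so it no longer competes for pool
++  workers.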
++*/
++static void connection_detach(connection_t *connection) {
++  DBUG_ENTER("connection_detach");
++  threadpool_thds.erase(connection);
++
++  thread_group_t *group = connection->thread_group;
++  bool is_admin_port = connection->thd->is_admin_connection();
++  Vio *const vio = connection->thd->get_protocol_classic()->get_vio();
++  const int fd = mysql_socket_getfd(vio->mysql_socket);
++  mysql_mutex_lock(&group->mutex);
++  io_poll_disassociate_fd(group->pollfd, fd);
++  connection->bound_to_poll_descriptor = false;
++  mysql_mutex_unlock(&group->mutex);
++
++  if (!is_admin_port) {
++    mysql_mutex_lock(&group->mutex);
++    group->connection_count--;
++    mysql_mutex_unlock(&group->mutex);
++  }
++
++  my_thread_handle thread_id;
++
++  if (mysql_thread_create(key_worker_thread, &thread_id, group->pthread_attr,
++                          connection_detach_worker, connection->thd)) {
++    threadpool_remove_connection(connection->thd);
++  }
++
++  my_free(connection);
++  DBUG_VOID_RETURN;
++}
++
++
++static void *connection_detach_worker(void *param) {
++  my_thread_init();
++  DBUG_ENTER("connection_detach_worker");
++  THD *thd = static_cast<THD *>(param);
++  assert(thd != nullptr);
++  thread_attach(thd);
++
++  while (1) {
++    if (threadpool_process_request(thd)) {
++      break;
++    }
++  }
++
++  threadpool_remove_connection(thd);
++  my_thread_end();
++  return nullptr;
++}
++
++/**
++  MySQL scheduler callback: kill connection
++*/
++
++void tp_post_kill_notification(THD *thd) noexcept {
++  DBUG_ENTER("tp_post_kill_notification");
++  if (current_thd == thd || thd->system_thread) {
++    DBUG_VOID_RETURN;
++  }
++
++  Vio *vio = thd->get_protocol_classic()->get_vio();
++  if (vio) vio_cancel(vio, SHUT_RD);
++  DBUG_VOID_RETURN;
++}
++
++alignas(CPU_LEVEL1_DCACHE_LINESIZE) std::atomic<uint64_t> tp_waits[THD_WAIT_LAST];
++
++/**
++  MySQL scheduler callback: wait begin
++*/
++void tp_wait_begin(THD *thd, int type MY_ATTRIBUTE((unused))) {
++  DBUG_ENTER("tp_wait_begin");
++
++  if (thd == nullptr) {
++    DBUG_VOID_RETURN;
++  }
++
++  connection_t *connection = (connection_t *)thd->scheduler.data;
++
++  if (connection && connection->thd &&
++      !connection->thd->is_admin_connection()) {
++    assert(!connection->waiting);
++    connection->waiting = true;
++    assert(type > 0 && type < THD_WAIT_LAST);
++    tp_waits[type]++;
++    wait_begin(connection->thread_group);
++  }
++  DBUG_VOID_RETURN;
++}
++
++/**
++  MySQL scheduler callback: wait end
++*/
++
++void tp_wait_end(THD *thd) {
++  DBUG_ENTER("tp_wait_end");
++
++  if (thd == nullptr) {
++    DBUG_VOID_RETURN;
++  }
++  connection_t *connection = (connection_t *)thd->scheduler.data;
++
++  if (connection && connection->thd &&
++      !connection->thd->is_admin_connection()) {
++    assert(connection->waiting);
++    connection->waiting = false;
++    wait_end(connection->thread_group);
++  }
++  DBUG_VOID_RETURN;
++}
++
++static void set_next_timeout_check(ulonglong abstime) {
++  DBUG_ENTER("set_next_timeout_check");
++  while (abstime < pool_timer.next_timeout_check.load()) {
++    uint64 old = pool_timer.next_timeout_check.load();
++    pool_timer.next_timeout_check.compare_exchange_weak(old, abstime);
++  }
++  DBUG_VOID_RETURN;
++}
++
++inline ulong get_wait_timeout(THD *thd) noexcept {
++  return thd->variables.net_wait_timeout;
++}
++
++/**
++  Set wait timeout for connection.
++*/
++
++static void set_wait_timeout(connection_t *c) noexcept {
++  DBUG_ENTER("set_wait_timeout");
++  /*
++    Calculate the wait deadline for this connection.
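++    The deadline is the current time plus one timer tick plus
++    net_wait_timeout; the extra tick compensates for the coarse timer
++    clock used below.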
++    Instead of using my_microsecond_getsystime(), which has a syscall
++    overhead, use pool_timer.current_microtime and take
++    into account that its value could be off by at most
++    one tick interval.
++  */
++
++  c->abs_wait_timeout =
++      pool_timer.current_microtime.load(std::memory_order_relaxed) +
++      1000LL * pool_timer.tick_interval +
++      1000000LL * get_wait_timeout(c->thd);
++
++  set_next_timeout_check(c->abs_wait_timeout);
++  DBUG_VOID_RETURN;
++}
++
++/**
++  Handle a (rare) special case, where a connection needs to
++  migrate to a different group because group_count has changed
++  after the thread_pool_size setting was modified.
++*/
++
++static int change_group(connection_t *c, thread_group_t *old_group,
++                        thread_group_t *new_group) {
++  assert(c->thread_group == old_group);
++
++  /* Remove connection from the old group. */
++  if (c->bound_to_poll_descriptor) {
++    Vio *const vio = c->thd->get_protocol_classic()->get_vio();
++    const int fd = mysql_socket_getfd(vio->mysql_socket);
++    mysql_mutex_lock(&old_group->mutex);
++    io_poll_disassociate_fd(old_group->pollfd, fd);
++    c->bound_to_poll_descriptor = false;
++  } else {
++    mysql_mutex_lock(&old_group->mutex);
++  }
++  c->thread_group->connection_count--;
++  mysql_mutex_unlock(&old_group->mutex);
++
++  /* Add connection to the new group. */
++  mysql_mutex_lock(&new_group->mutex);
++  c->thread_group = new_group;
++  new_group->connection_count++;
++  /* Ensure that there is a listener in the new group. */
++  int ret = 0;
++  if (!new_group->thread_count) ret = create_worker(new_group, false);
++  mysql_mutex_unlock(&new_group->mutex);
++  return ret;
++}
++
++static int start_io(connection_t *connection) {
++  /*
++    Usually, a connection will stay in the same group for its entire
++    lifetime. However, we do allow group_count to change at runtime,
++    which means that in rare cases, when it changes, a connection may
++    need to migrate to another group to keep the load between groups
++    equal.
++
++    So we recalculate in which group the connection should be, based
++    on thread_id and the current group count, and migrate if necessary.
++  */
++  thread_group_t *const group =
++      &all_groups[connection->thd->thread_id() % group_count];
++
++  if (group != connection->thread_group) {
++    if (change_group(connection, connection->thread_group, group)) return -1;
++  }
++
++  /*
++    Bind to the poll descriptor if not yet done.
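++    The first call associates the socket fd with the group's poll
++    descriptor (io_poll_associate_fd); subsequent calls merely re-arm it
++    for the next read event (io_poll_start_read).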
++  */
++  Vio *vio = connection->thd->get_protocol_classic()->get_vio();
++  int fd = mysql_socket_getfd(vio->mysql_socket);
++  if (!connection->bound_to_poll_descriptor) {
++    connection->bound_to_poll_descriptor = true;
++    return io_poll_associate_fd(group->pollfd, fd, connection);
++  }
++
++  return io_poll_start_read(group->pollfd, fd, connection);
++}
++
++static void handle_event(connection_t *connection) {
++  DBUG_ENTER("handle_event");
++  int err = 0;
++
++  while (1) {
++    if (!connection->logged_in) {
++      err = threadpool_add_connection(connection->thd);
++      connection->logged_in = true;
++    } else {
++      err = threadpool_process_request(connection->thd);
++    }
++
++    if (err) {
++      goto end;
++    }
++
++    if (connection->thd == thd_to_detach) {
++      connection_detach(connection);
++      goto end_return;
++    }
++
++    set_wait_timeout(connection);
++
++    if (!connection_is_worker_continue(*connection)) {
++      break;
++    }
++  }
++
++  if (!connection->thd->is_admin_connection()) {
++    err = start_io(connection);
++  }
++
++end:
++  if (err || connection->thd->is_admin_connection()) {
++    connection_abort(connection);
++  }
++
++end_return:
++  DBUG_VOID_RETURN;
++}
++
++static void *admin_port_worker_main(void *param) {
++  my_thread_init();
++  DBUG_ENTER("admin_port_worker_main");
++
++#ifdef HAVE_PSI_THREAD_INTERFACE
++  PSI_THREAD_CALL(set_thread_account)
++  (nullptr, 0, nullptr, 0);
++#endif
++
++  connection_t *connection = static_cast<connection_t *>(param);
++  assert(connection != nullptr);
++  assert(connection->thread_group != nullptr);
++  thread_group_t *group = connection->thread_group;
++
++  handle_event(connection);
++
++  mysql_mutex_lock(&group->mutex);
++  group->admin_port_thread_count--;
++  mysql_mutex_unlock(&group->mutex);
++
++  my_thread_end();
++  return nullptr;
++}
++
++/**
++  Worker thread's main
++*/
++static void *worker_main(void *param) {
++  my_thread_init();
++
++  DBUG_ENTER("worker_main");
++
++  thread_group_t *thread_group = static_cast<thread_group_t *>(param);
++  assert(thread_group != nullptr);
++
++  if (threadpool_sched_affinity) {
++    /* Pointer subtraction already yields the group index; dividing by
++       sizeof(thread_group_t) would always compute index 0. */
++    group_affinity.bind_numa(thread_group - all_groups);
++  }
++
++  /* Init per-thread structure */
++  worker_thread_t this_thread;
++  mysql_cond_init(key_worker_cond, &this_thread.cond);
++  this_thread.thread_group = thread_group;
++  this_thread.event_count = 0;
++
++#ifdef HAVE_PSI_THREAD_INTERFACE
++  PSI_THREAD_CALL(set_thread_account)
++  (nullptr, 0, nullptr, 0);
++#endif
++
++  /* Run the event loop */
++  for (;;) {
++    struct timespec ts;
++    set_timespec(&ts, threadpool_idle_timeout);
++    connection_t *connection = get_event(&this_thread, thread_group, &ts);
++
++    if (!connection) {
++      break;
++    }
++
++    this_thread.event_count++;
++    handle_event(connection);
++  }
++
++  /* Thread shutdown: clean up the per-worker-thread structure.
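++     The last worker to leave a group that is shutting down also
++     destroys the group itself (see last_thread below).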
++  */
++  mysql_cond_destroy(&this_thread.cond);
++
++  bool last_thread = false; /* last thread in group exits */
++  mysql_mutex_lock(&thread_group->mutex);
++  add_thread_count(thread_group, -1);
++  last_thread = ((thread_group->thread_count == 0) && thread_group->shutdown);
++  mysql_mutex_unlock(&thread_group->mutex);
++
++  /* Last thread in group exits and pool is terminating, destroy group. */
++  if (last_thread) {
++    thread_group_destroy(thread_group);
++  }
++
++  my_thread_end();
++  return nullptr;
++}
++
++bool tp_init() {
++  DBUG_ENTER("tp_init");
++  threadpool_started = true;
++  group_affinity.init();
++
++  for (uint i = 0; i < array_elements(all_groups); i++) {
++    thread_group_init(&all_groups[i], get_connection_attrib());
++  }
++  tp_set_threadpool_size(threadpool_size);
++  if (group_count == 0) {
++    /* Something went wrong */
++    sql_print_error("Can't set threadpool size to %d", threadpool_size);
++    DBUG_RETURN(true);
++  }
++#ifdef HAVE_PSI_INTERFACE
++  mysql_mutex_register("threadpool", mutex_list, array_elements(mutex_list));
++  mysql_cond_register("threadpool", cond_list, array_elements(cond_list));
++  mysql_thread_register("threadpool", thread_list, array_elements(thread_list));
++#endif
++
++  pool_timer.tick_interval = threadpool_stall_limit;
++  start_timer(&pool_timer);
++  DBUG_RETURN(false);
++}
++
++void tp_end_thread() {
++  if (!threadpool_started) {
++    return;
++  }
++
++  while (!threadpool_thds.empty()) {
++    my_sleep(10000);
++  }
++
++  stop_timer(&pool_timer);
++
++  for (uint i = 0; i < array_elements(all_groups); i++) {
++    thread_group_close(&all_groups[i]);
++  }
++
++  threadpool_started = false;
++}
++
++void tp_end() {
++  DBUG_ENTER("tp_end");
++  threadpool_thds.killConns();
++
++  std::thread exit_tp(tp_end_thread);
++  exit_tp.detach();
++  DBUG_VOID_RETURN;
++}
++
++/** Ensure that poll descriptors are created when threadpool_size changes */
++void tp_set_threadpool_size(uint size) noexcept {
++  if (!threadpool_started) return;
++
++  bool success = true;
++  for (uint i = 0; i < size; i++) {
++    thread_group_t *group = &all_groups[i];
++    mysql_mutex_lock(&group->mutex);
++    if (group->pollfd == -1) {
++      group->pollfd = io_poll_create();
++      success = (group->pollfd >= 0);
++      if (!success) {
++        sql_print_error("io_poll_create() failed, errno=%d\n", errno);
++      }
++    }
++    /* Unlock before acting on a failure; breaking out of the loop while
++       still holding the mutex would leak the lock. */
++    mysql_mutex_unlock(&all_groups[i].mutex);
++    if (!success) {
++      group_count = i;
++      return;
++    }
++  }
++  group_count = size;
++}
++
++void tp_set_threadpool_stall_limit(uint limit) noexcept {
++  if (!threadpool_started) {
++    return;
++  }
++
++  mysql_mutex_lock(&(pool_timer.mutex));
++  pool_timer.tick_interval = limit;
++  mysql_mutex_unlock(&(pool_timer.mutex));
++  mysql_cond_signal(&(pool_timer.cond));
++}
++
++/**
++  Calculate the number of idle/waiting threads in the pool.
++
++  Sum idle threads over all groups.
++  Don't do any locking, it is not required for stats.
++*/
++int tp_get_idle_thread_count() noexcept {
++  int sum = 0;
++  for (uint i = 0;
++       i < array_elements(all_groups) && (all_groups[i].pollfd >= 0); i++) {
++    sum += (all_groups[i].thread_count - all_groups[i].active_thread_count);
++  }
++  return sum;
++}
++
++/* Report threadpool problems */
++
++/**
++  Delay in microseconds, after which the "pool blocked" message is printed.
++  (30 sec == 30 million usec)
++*/
++#define BLOCK_MSG_DELAY (30 * 1000000)
++
++#define MAX_THREADS_REACHED_MSG \
++  "Threadpool could not create additional thread to handle queries, because the \
++number of allowed threads was reached. 
Increasing the 'thread_pool_max_threads' \
++parameter can help in this situation.\n \
++If the 'admin_port' parameter is set, you can still connect to the database with \
++a superuser account (it must be a TCP connection using admin_port as the TCP port) \
++and troubleshoot the situation. \
++A likely cause of pool blocks is clients that lock resources for a long time. \
++'show processlist' or 'show engine innodb status' can give additional hints."
++
++#define CREATE_THREAD_ERROR_MSG "Can't create threads in threadpool (errno=%d)."
++
++/**
++  Write a message when a blocking situation in the threadpool occurs.
++  The message is written only when the pool blocks for BLOCK_MSG_DELAY (30)
++  seconds. It will be just a single message for each blocking situation (to
++  prevent log flood).
++*/
++static void print_pool_blocked_message(bool max_threads_reached) noexcept {
++  ulonglong now = my_microsecond_getsystime();
++  static bool msg_written = false;
++
++  if (pool_block_start == 0) {
++    pool_block_start = now;
++    msg_written = false;
++  }
++
++  if (!msg_written && ((now > pool_block_start + BLOCK_MSG_DELAY) ||
++                       (now == pool_block_start))) {
++    if (max_threads_reached)
++      sql_print_error(MAX_THREADS_REACHED_MSG);
++    else
++      sql_print_error(CREATE_THREAD_ERROR_MSG, my_errno);
++
++    if (now > pool_block_start) {
++      sql_print_information("Threadpool has been blocked for %u seconds\n",
++                            (uint)((now - pool_block_start) / 1000000));
++    }
++    /* avoid repeated messages for the same blocking situation */
++    msg_written = true;
++  }
++}
+diff --git a/plugin/thread_pool/threadpool_unix.h b/plugin/thread_pool/threadpool_unix.h
+new file mode 100644
+index 00000000000..3c561f2da75
+--- /dev/null
++++ b/plugin/thread_pool/threadpool_unix.h
+@@ -0,0 +1,135 @@
++/* Copyright (C) 2012 Monty Program Ab
++   Copyright (C) 2022 Huawei Technologies Co., Ltd
++
++   This program is free software; you can redistribute it and/or modify
++   it under the terms of the GNU General Public License as published by
++   the Free Software Foundation; version 2 of the License.
++
++   This program is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++   GNU General Public License for more details.
++
++   You should have received a copy of the GNU General Public License
++   along with this program; if not, write to the Free Software
++   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
++   USA */
++
++#ifndef THREADPOOL_UNIX_H_
++#define THREADPOOL_UNIX_H_
++
++#include "mysql/service_thd_wait.h"
++#include "sql/sql_plist.h"
++#include "sql/mysqld.h"
++#include "threadpool.h"
++#include "violite.h"
++#include "numa_affinity_manager.h"
++
++#ifdef __linux__
++#include <sys/epoll.h>
++typedef struct epoll_event native_event;
++#endif
++#if defined(__FreeBSD__) || defined(__APPLE__)
++#include <sys/event.h>
++typedef struct kevent native_event;
++#endif
++#if defined(__sun)
++#include <port.h>
++typedef port_event_t native_event;
++#endif
++
++#define my_microsecond_getsystime() (my_getsystime() / 10)
++
++struct thread_group_t;
++
++/* Per-thread structure for workers */
++struct worker_thread_t {
++  ulonglong event_count; /* number of requests handled by this thread */
++  thread_group_t *thread_group;
++  worker_thread_t *next_in_list;
++  worker_thread_t **prev_in_list;
++
++  mysql_cond_t cond;
++  bool woken;
++};
++
++typedef I_P_List<
++    worker_thread_t,
++    I_P_List_adapter<worker_thread_t, &worker_thread_t::next_in_list,
++                     &worker_thread_t::prev_in_list>>
++    worker_list_t;
++
++struct connection_t {
++  THD *thd;
++  thread_group_t *thread_group;
++  connection_t *next_in_queue;
++  connection_t **prev_in_queue;
++  ulonglong abs_wait_timeout;
++  ulonglong enqueue_time;
++  bool logged_in;
++  bool bound_to_poll_descriptor;
++  bool waiting;
++  uint tickets;
++};
++
++typedef I_P_List<connection_t,
++                 I_P_List_adapter<connection_t, &connection_t::next_in_queue,
++                                  &connection_t::prev_in_queue>,
++                 I_P_List_counter, I_P_List_fast_push_back<connection_t>>
++    connection_queue_t;
++
++const int NQUEUES = 2; /* We have high and low priority queues */
++
++enum operation_origin
++{
++  WORKER,
++  LISTENER
++};
++
++struct thread_group_counters_t
++{
++  ulonglong thread_creations;
++  ulonglong thread_creations_due_to_stall;
++  ulonglong wakes;
++  ulonglong wakes_due_to_stall;
++  ulonglong throttles;
++  ulonglong stalls;
++  ulonglong dequeues[2];
++  ulonglong polls[2];
++};
++
++struct alignas(128) thread_group_t {
++  mysql_mutex_t mutex;
++  connection_queue_t queue;
++  connection_queue_t high_prio_queue;
++  worker_list_t waiting_threads;
++  worker_thread_t *listener;
++  pthread_attr_t *pthread_attr;
++  int pollfd;
++  int thread_count;
++  int admin_port_thread_count;
++  int dump_thread_count;
++  int active_thread_count;
++  int connection_count;
++  int waiting_thread_count;
++  /* Stats for the deadlock detection timer routine. */
++  int io_event_count;
++  int queue_event_count;
++  ulonglong last_thread_creation_time;
++  int shutdown_pipe[2];
++  bool shutdown;
++  bool stalled;
++  thread_group_counters_t counters;
++  char padding[320 - sizeof(thread_group_counters_t)];
++};
++
++static_assert(sizeof(thread_group_t) == 512,
++              "sizeof(thread_group_t) must be 512 to avoid false sharing");
++
++#define TP_INCREMENT_GROUP_COUNTER(group, var) \
++  do { group->counters.var++; } while (0)
++
++extern thread_group_t all_groups[MAX_THREAD_GROUPS];
++extern numa_affinity_manager group_affinity;
++
++#endif  // THREADPOOL_UNIX_H_
++
diff --git a/my.cnf b/my.cnf
new file mode 100644
index 0000000000000000000000000000000000000000..f22589484a69114aace571e45cbfe8846473f7e9
--- /dev/null
+++ b/my.cnf
@@ -0,0 +1,81 @@
+[mysqld_safe]
+log-error=/data/mysql/log/mysql.log
+pid-file=/data/mysql/run/mysqld.pid
+
+[client]
+socket=/data/mysql/run/mysql.sock
+default-character-set=utf8
+
+[mysqld]
+server-id=1
+basedir=/usr/local/mysql
+tmpdir=/data/mysql/tmp
+datadir=/data/mysql/data
+socket=/data/mysql/run/mysql.sock
+port=3306
+user=root
+default_authentication_plugin=mysql_native_password
+
+max_connections=2000 #maximum number of connections
+back_log=4000 #size of the backlog for queued connection requests
+performance_schema=OFF #disable the performance schema
+max_prepared_stmt_count=128000
+#transaction_isolation=READ-COMMITTED
+
+#file
+innodb_file_per_table=on #one tablespace file per table
+innodb_log_file_size=2048M #redo log file size
+innodb_log_files_in_group=32 #number of redo log files in the group
+innodb_open_files=1000 #maximum number of open InnoDB files
+table_open_cache_instances=64
+
+#buffers
+innodb_buffer_pool_size=22G #buffer pool size, typically about 60% of server memory
+innodb_buffer_pool_instances=16 #number of buffer pool instances, improves concurrency
+innodb_log_buffer_size=2048M #log buffer size
+
+#tune
+default_time_zone='system'
+sync_binlog=1 #flush the binlog to disk on every transaction commit
+innodb_flush_log_at_trx_commit=1 #on every commit, write the log buffer to the log file and flush it to disk
+innodb_use_native_aio=1 #enable native asynchronous I/O
+innodb_spin_wait_delay=5 #spin wait delay, avoids descending into system-level spinning
+innodb_sync_spin_loops=20 #number of spin loop rounds before yielding to the OS
+innodb_flush_method=O_DIRECT #open/flush mode for InnoDB data files and the redo log
+innodb_io_capacity=30000 #upper limit on IOPS for InnoDB background threads
+innodb_io_capacity_max=40000 #upper limit on IOPS for InnoDB background threads under pressure
+innodb_lru_scan_depth=9000 #number of dirty pages each page cleaner flushes per pass
+innodb_page_cleaners=16 #number of threads writing dirty pages to disk
+innodb_spin_wait_pause_multiplier=5 #randomization multiplier for spin lock loops
+
+#perf special
+innodb_flush_neighbors=0 #flush neighboring dirty pages in the same extent together; disable for SSDs
+innodb_write_io_threads=24 #number of write I/O threads
+innodb_read_io_threads=16 #number of read I/O threads
+innodb_purge_threads=32 #number of threads reclaiming used undo pages
+
+sql_mode=STRICT_TRANS_TABLES,NO_ENGINE_SUBSTITUTION,NO_AUTO_VALUE_ON_ZERO,STRICT_ALL_TABLES
+
+#skip_log_bin
+log-bin=mysql-bin
+ssl=0 #disable SSL
+table_open_cache=30000 #number of open tables cached
+max_connect_errors=2000
+innodb_adaptive_hash_index=0
+
+#thread-pool
+plugin-load-add=thread_pool.so #load the thread pool plugin; requires a server restart
+#thread_pool_size=56 #defaults to the number of CPU cores; in small-thread-group mode may be set to 4x the number of NUMA nodes
+#thread_pool_dedicated_listener=OFF #defaults to OFF; set to ON in small-thread-group mode
+#thread_pool_oversubscribe=3 #defaults to 3; in small-thread-group mode set this to (connection count at the baseline's best performance) / thread_pool_size
+#thread_pool_toobusy=13 #defaults to 3; in small-thread-group mode set this equal to thread_pool_oversubscribe
+
+#sched-affinity
+#sched_affinity_numa_aware=ON #NUMA-aware grouping of foreground threads; a global-scope parameter
+#sched_affinity_foreground_thread= #CPU core(s) on which foreground threads may run; a dash (-) denotes a range, entries may be comma-separated
+#sched_affinity_log_writer= #CPU core(s) on which the MySQL log_writer thread may run
+#sched_affinity_log_flusher= #CPU core(s) on which the MySQL log_flusher thread may run
+#sched_affinity_log_write_notifier= #CPU core(s) on which the MySQL log_write_notifier thread may run
+#sched_affinity_log_flush_notifier= #CPU core(s) on which the MySQL log_flush_notifier thread may run
+#sched_affinity_log_checkpointer= #CPU core(s) on which the MySQL log_checkpointer thread may run
+#sched_affinity_purge_coordinator= #CPU core(s) on which the MySQL purge_coordinator thread may run
diff --git a/mysql-boost-8.0.25.tar.gz b/mysql-boost-8.0.25.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..8004178a36b1073b705c36468f15e0a9ef697b2a
Binary files /dev/null and b/mysql-boost-8.0.25.tar.gz differ
diff --git a/mysql.spec b/mysql.spec
new file mode 100644
index 0000000000000000000000000000000000000000..de0a1afcd40798065b870be91099caaed293ee19
--- /dev/null
+++ b/mysql.spec
@@ -0,0 +1,161 @@
+%define _python_bytecompile_errors_terminate_build 0
+Name: boostkit-mysql
+Version: 8.0.25
+Release: 1
+License: GPLv2
+URL: http://www.mysql.com
+Group: applications/database
+BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root
+
+Source0: mysql-boost-8.0.25.tar.gz
+Source1: my.cnf
+
+Patch0001: KunpengBoostKit22.0.RC4-CODE-THREADPOOL-FOR-MySQL-8.0.25.patch
+Patch0002: KunpengBoostKit22.0.RC4-CODE-SCHED-AFFINITY-FOR-MySQL-8.0.25.patch
+
+%if 0%{?rhel} >= 8
+BuildRequires: cmake >= 3.6.1
+BuildRequires: 
libtirpc-devel
+BuildRequires: rpcgen
+%else
+BuildRequires: cmake3 >= 3.6.1
+%endif
+BuildRequires: bison >= 2.1
+BuildRequires: perl
+BuildRequires: gcc-c++ libtirpc-devel rpcgen
+%{?el7:BuildRequires: perl(Env)}
+%{?el8:BuildRequires: perl(Env)}
+BuildRequires: perl(Carp)
+BuildRequires: perl(Config)
+BuildRequires: perl(Cwd)
+BuildRequires: perl(Data::Dumper)
+BuildRequires: perl(English)
+BuildRequires: perl(Errno)
+BuildRequires: perl(Exporter)
+BuildRequires: perl(Fcntl)
+BuildRequires: perl(File::Basename)
+BuildRequires: perl(File::Copy)
+BuildRequires: perl(File::Find)
+BuildRequires: perl(File::Path)
+BuildRequires: perl(File::Spec)
+BuildRequires: perl(File::Spec::Functions)
+BuildRequires: perl(File::Temp)
+BuildRequires: perl(Getopt::Long)
+BuildRequires: perl(IO::File)
+BuildRequires: perl(IO::Handle)
+BuildRequires: perl(IO::Pipe)
+BuildRequires: perl(IO::Select)
+BuildRequires: perl(IO::Socket)
+BuildRequires: perl(IO::Socket::INET)
+BuildRequires: perl(JSON)
+BuildRequires: perl(Memoize)
+BuildRequires: perl(POSIX)
+BuildRequires: perl(Sys::Hostname)
+BuildRequires: perl(Time::HiRes)
+BuildRequires: perl(Time::localtime)
+BuildRequires: time
+BuildRequires: libaio-devel
+BuildRequires: ncurses-devel
+BuildRequires: numactl-devel
+BuildRequires: numactl-libs
+BuildRequires: numactl
+BuildRequires: openssl-devel
+BuildRequires: zlib-devel
+BuildRequires: cyrus-sasl-devel
+BuildRequires: openldap-devel
+
+Requires: numactl-devel
+Requires: numactl-libs
+Requires: numactl
+Requires: perl(Carp)
+Requires: perl(Config)
+Requires: perl(Cwd)
+Requires: perl(Data::Dumper)
+Requires: perl(English)
+Requires: perl(Errno)
+Requires: perl(Exporter)
+Requires: perl(Fcntl)
+Requires: perl(File::Basename)
+Requires: perl(File::Copy)
+Requires: perl(File::Find)
+Requires: perl(File::Path)
+Requires: perl(File::Spec)
+Requires: perl(File::Spec::Functions)
+Requires: perl(File::Temp)
+Requires: perl(Getopt::Long)
+Requires: perl(IO::File)
+Requires: perl(IO::Handle)
+Requires: perl(IO::Pipe)
+Requires: perl(IO::Select)
+Requires: perl(IO::Socket)
+Requires: perl(IO::Socket::INET)
+Requires: perl(JSON)
+Requires: perl(Memoize)
+Requires: perl(POSIX)
+Requires: perl(Sys::Hostname)
+Requires: perl(Time::HiRes)
+Requires: perl(Time::localtime)
+
+Summary: A very fast and reliable SQL database server
+
+%description
+The MySQL(TM) software delivers a very fast, multi-threaded, multi-user,
+and robust SQL (Structured Query Language) database server.
+
+%define mysql_server_path /usr/local/mysql
+%define mysql_conf_path /etc
+%define MYSQL_USER mysql
+%define MYSQL_GROUP mysql
+
+%prep
+%setup -q -n mysql-%{version}
+%ifarch aarch64
+%autopatch -p1
+%endif
+
+%build
+cmake . -DCMAKE_INSTALL_PREFIX=%{mysql_server_path} -DDOWNLOAD_BOOST=0 -DWITH_BOOST=./boost -DFORCE_INSOURCE_BUILD=1
+make %{?_smp_mflags}
+
+%install
+rm -rf %{buildroot}
+make DESTDIR=%{buildroot} install
+mkdir -p %{buildroot}%{mysql_conf_path}
+cp %{_sourcedir}/my.cnf $RPM_BUILD_ROOT%{mysql_conf_path}/
+
+%post
+echo "================================================"
+echo "MySQL install location: /usr/local/mysql"
+echo "MySQL configuration file location: /etc/my.cnf"
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+rm -rf $RPM_BUILD_DIR/*
+
+%files
+%defattr(-, %{MYSQL_USER}, %{MYSQL_GROUP})
+%attr(755, %{MYSQL_USER}, %{MYSQL_GROUP}) %{mysql_server_path}/*
+%attr(755, %{MYSQL_USER}, %{MYSQL_GROUP}) %{mysql_conf_path}/my.cnf
+%dir %{mysql_server_path}/
+%{mysql_server_path}/
+%{mysql_conf_path}/my.cnf
+
+%pre
+if ! 
id %{MYSQL_USER} > /dev/null 2>&1; then
+useradd -M -s /sbin/nologin %{MYSQL_USER}
+fi
+if [ -f %{_sysconfdir}/my.cnf ]; then
+mv %{_sysconfdir}/my.cnf %{_sysconfdir}/my.cnf.rpmold
+fi
+
+%preun
+if [ -f %{_initddir}/mysql ]; then
+mv %{_initddir}/mysql %{_initddir}/mysql.rpmold
+fi
+
+%postun
+rm -rf %{mysql_server_path}
+userdel -r %{MYSQL_USER} >/dev/null 2>&1
+
+%changelog
+* Tue Sep 26 2023 caiyuanhuan - 1.0.0.0
+- First script for openEuler-22.03 KunpengBoostKit-MySQL package