diff --git a/atune.spec b/atune.spec index 65b547600296649569065906e94b1cb08d27e881..3e567b61c9c30e11b4d6ff452f7baae933d6a0e8 100755 --- a/atune.spec +++ b/atune.spec @@ -3,7 +3,7 @@ Summary: AI auto tuning system Name: atune Version: 1.2.0 -Release: 5 +Release: 6 License: MulanPSL-2.0 URL: https://gitee.com/openeuler/A-Tune Source: https://gitee.com/openeuler/A-Tune/repository/archive/v%{version}.tar.gz @@ -17,6 +17,7 @@ Patch9005: adapt-sqlite-3.42-to-resolve-the-build-failure.patch Patch9006: 0001-fix-skopt.Optimizer-incompatible-with-numpy-1.24.patch Patch9007: 0002-scikit-learn-1.2-rename-the-parameter-base_estimator.patch Patch9008: 0003-fix-atune-adm-analyse-failed-problem.patch +Patch9009: backport-copilot-tune-codes.patch BuildRequires: rpm-build golang-bin procps-ng BuildRequires: sqlite >= 3.24.0 openssl @@ -24,6 +25,10 @@ BuildRequires: python3-scikit-optimize python3-pandas python3-xgboost BuildRequires: python3-pyyaml BuildRequires: systemd BuildRequires: perf +BuildRequires: python3-devel +BuildRequires: python3-setuptools +BuildRequires: python3-wheel +BuildRequires: python3-pip Requires: systemd Requires: atune-client = %{version}-%{release} Requires: atune-db = %{version}-%{release} @@ -81,6 +86,14 @@ Conflicts: atune < 1.1.0 %description rest atune restful api for manage atuned AI tuning system. +%package -n copilot-tune +Summary: Copilot Tuning components for A-Tune +License: MulanPSL-2.0 +Requires: python3 + +%description -n copilot-tune +Copilot Tune: Python helpers, configs and services integrated into A-Tune. + %prep %setup -n A-Tune-v%{version} %ifarch sw_64 @@ -96,6 +109,7 @@ atune restful api for manage atuned AI tuning system. %patch9006 -p1 %patch9007 -p1 %patch9008 -p1 +%patch9009 -p1 %build %make_build @@ -103,8 +117,18 @@ atune restful api for manage atuned AI tuning system. %install %make_install +cd "%{_builddir}/A-Tune-v%{version}/copilot-tune" +%{__python3} setup.py install -O1 --root %{buildroot} --prefix %{_prefix} + %check +%pre -n copilot-tune +[ -d /etc/copilot-tune ] || mkdir -p /etc/copilot-tune +[ -d /etc/copilot-tune/knowledge_base ] || mkdir -p /etc/copilot-tune/knowledge_base +[ -d /etc/copilot-tune/config ] || mkdir -p /etc/copilot-tune/config +[ -d /etc/copilot-tune/scripts ] || mkdir -p /etc/copilot-tune/scripts +exit 0 + %post %systemd_post atuned.service @@ -132,6 +156,18 @@ atune restful api for manage atuned AI tuning system. %postun rest %systemd_postun_with_restart atune-rest.service +%post -n copilot-tune +%systemd_post tune-mcpserver.service +%systemd_post tune-openapi.service + +%preun -n copilot-tune +%systemd_preun tune-mcpserver.service +%systemd_preun tune-openapi.service + +%postun -n copilot-tune +%systemd_postun_with_restart tune-mcpserver.service +%systemd_postun_with_restart tune-openapi.service + %files %license License/LICENSE %defattr(0640,root,root,0750) @@ -195,7 +231,29 @@ atune restful api for manage atuned AI tuning system.
%exclude /etc/atuned/engine_certs/* %exclude /etc/atuned/rest_certs +%files -n copilot-tune +%defattr(0640,root,root,0750) +%attr(0550,root,root) %{python3_sitelib}/src +%exclude %{python3_sitelib}/src/__pycache__ +%attr(0550,root,root) %{python3_sitelib}/copilot_tune-*-py*.egg-info/ +%attr(0550,root,root) %{_bindir}/tune-openapi +%attr(0550,root,root) %{_bindir}/tune-mcpserver +%attr(0550,root,root) %{_bindir}/copilot-tune +%attr(0640,root,root) %{_unitdir}/tune-mcpserver.service +%attr(0640,root,root) %{_unitdir}/tune-openapi.service +%attr(0750,root,root) %dir /etc/copilot-tune +%attr(0750,root,root) %dir /etc/copilot-tune/config +%attr(0750,root,root) %dir /etc/copilot-tune/scripts +%attr(0750,root,root) %dir /etc/copilot-tune/knowledge_base +%attr(0640,root,root) /etc/copilot-tune/config/* +%attr(0640,root,root) /etc/copilot-tune/scripts/* +%config(noreplace) %attr(0640,root,root) /etc/copilot-tune/config/.env.yaml +%attr(0640,root,root) /etc/copilot-tune/knowledge_base/* + %changelog +* Thu Sep 04 2025 jinsaihang - 1.2.0-6 +- submit copilot tune codes + * Mon Apr 21 2025 caixiaomeng - 1.2.0-5 - fix atune-adm analyse failed problem diff --git a/backport-copilot-tune-codes.patch b/backport-copilot-tune-codes.patch new file mode 100644 index 0000000000000000000000000000000000000000..e5b7a079ede4feb8b9c670a48a730697bf91b356 --- /dev/null +++ b/backport-copilot-tune-codes.patch @@ -0,0 +1,17963 @@ +From 40add988887f82a49630683fcf7fca0ee5cbc3ca Mon Sep 17 00:00:00 2001 +From: jinsaihang +Date: Mon, 8 Sep 2025 03:47:31 -0400 +Subject: [PATCH] copilot tune codes + +--- + copilot-tune/FAQ.md | 230 ++ + copilot-tune/LICENSE | 127 ++ + copilot-tune/README.md | 199 ++ + copilot-tune/__init__.py | 0 + copilot-tune/config/.env.yaml | 72 + + copilot-tune/config/app_config.yaml | 81 + + copilot-tune/config/knob_rag_config.json | 4 + + copilot-tune/config/openapi.yaml | 69 + + copilot-tune/config/optimize_config.yaml | 4 + + copilot-tune/doc/zh/README.md | 199 ++ + copilot-tune/requirements.txt | 20 + + copilot-tune/scripts/ceph/benchmark.sh | 2 + + copilot-tune/scripts/ceph/parse_benchmark.sh | 7 + + copilot-tune/scripts/flink/benchmark.sh | 4 + + copilot-tune/scripts/flink/parse_benchmark.sh | 34 + + copilot-tune/scripts/gaussdb/benchmark.sh | 2 + + copilot-tune/scripts/gaussdb/parse_benmark.sh | 2 + + copilot-tune/scripts/mysql/benchmark.sh | 4 + + copilot-tune/scripts/mysql/parse_benchmark.sh | 6 + + copilot-tune/scripts/nginx/benchmark.sh | 5 + + copilot-tune/scripts/nginx/parse_benchmark.sh | 12 + + copilot-tune/scripts/postgresql/benchmark.sh | 4 + + .../scripts/postgresql/parse_benchmark.sh | 6 + + copilot-tune/scripts/redis/benchmark.sh | 17 + + copilot-tune/scripts/redis/parse_benchmark.sh | 6 + + copilot-tune/scripts/spark/benchmark.sh | 3 + + copilot-tune/scripts/spark/parse_benchmark.sh | 14 + + copilot-tune/service/tune-mcpserver.service | 15 + + copilot-tune/service/tune-openapi.service | 15 + + copilot-tune/setup.py | 68 + + copilot-tune/src/__init__.py | 11 + + copilot-tune/src/config.py | 32 + + copilot-tune/src/knowledge_base/README.md | 1 + + .../src/knowledge_base/knob_params/ceph.json | 641 ++++++ + .../src/knowledge_base/knob_params/flink.json | 300 +++ + .../knowledge_base/knob_params/gaussdb.json | 382 ++++ + .../src/knowledge_base/knob_params/mysql.json | 621 ++++++ + .../src/knowledge_base/knob_params/nginx.json | 420 ++++ + .../src/knowledge_base/knob_params/pgsql.json | 448 ++++ + .../src/knowledge_base/knob_params/redis.json | 332 +++ + 
.../src/knowledge_base/knob_params/spark.json | 199 ++ + .../knowledge_base/knob_params/system.json | 1395 ++++++++++++ + .../src/knowledge_base/optimize/README.md | 1 + + .../optimize/parameter/mysql.jsonl | 22 + + .../optimize/parameter/spark.jsonl | 9 + + .../optimize/parameter/system.jsonl | 39 + + .../optimize/strategy/system.jsonl | 302 +++ + .../knowledge_base/params/mysql_params.json | 849 ++++++++ + .../knowledge_base/params/system_params.json | 1935 +++++++++++++++++ + .../src/performance_analyzer/__init__.py | 0 + .../src/performance_analyzer/app_analyzer.py | 77 + + .../src/performance_analyzer/base_analyzer.py | 38 + + .../src/performance_analyzer/cpu_analyzer.py | 176 ++ + .../src/performance_analyzer/disk_analyzer.py | 109 + + .../performance_analyzer/memory_analyzer.py | 72 + + .../micro_dep_analyzer.py | 95 + + .../performance_analyzer/network_analyzer.py | 103 + + .../performance_analyzer.py | 111 + + .../src/performance_collector/__init__.py | 0 + .../performance_collector/app_collector.py | 45 + + .../application/ceph_collector.py | 165 ++ + .../application/flink_collector.py | 269 +++ + .../application/gaussdb_collector.py | 206 ++ + .../application/mysql_collector.py | 145 ++ + .../application/nginx_collector.py | 88 + + .../application/pgsql_collector.py | 128 ++ + .../application/redis_collector.py | 107 + + .../application/spark_collector.py | 146 ++ + .../performance_collector/base_collector.py | 63 + + .../performance_collector/cpu_collector.py | 304 +++ + .../performance_collector/disk_collector.py | 117 + + .../performance_collector/memory_collector.py | 151 ++ + .../performance_collector/metric_collector.py | 81 + + .../micro_dep_collector.py | 453 ++++ + .../network_collector.py | 139 ++ + .../static_metric_profile_collector.py | 52 + + .../static_profile_collector.py | 246 +++ + .../src/performance_optimizer/__init__.py | 0 + .../performance_optimizer/base_optimizer.py | 185 ++ + .../performance_optimizer/knob_optimizer.py | 132 ++ + .../performance_optimizer/param_knowledge.py | 117 + + .../performance_optimizer/param_optimizer.py | 215 ++ + .../param_recommender.py | 177 ++ + .../performance_optimizer/set_knob_cmd.jsonl | 64 + + .../strategy_optimizer.py | 221 ++ + copilot-tune/src/performance_test/__init__.py | 0 + .../src/performance_test/pressure_test.py | 60 + + copilot-tune/src/start_mcpserver.py | 228 ++ + copilot-tune/src/start_tune.py | 167 ++ + copilot-tune/src/start_workflow.py | 193 ++ + .../src/tests/manager/task_manager.py | 47 + + .../src/tests/manager/test_trigger_signal.py | 34 + + copilot-tune/src/tests/mock_ssh_client.py | 17 + + .../tests/test_perf_optim/param_knowledge.py | 10 + + .../test_utils/collector/collector_trigger.py | 30 + + .../test_utils/collector/metric_collector.py | 66 + + .../collector/test_cpu_collector.py | 50 + + .../test_utils/collector/test_io_collector.py | 50 + + .../src/tests/test_utils/config/app_config.py | 23 + + .../tests/test_utils/test_pressure_test.py | 43 + + .../src/tests/test_utils/thread_pool.py | 29 + + copilot-tune/src/utils/README.md | 169 ++ + copilot-tune/src/utils/__init__.py | 0 + copilot-tune/src/utils/collector/__init__.py | 0 + .../src/utils/collector/collector_trigger.py | 202 ++ + .../src/utils/collector/metric_collector.py | 229 ++ + copilot-tune/src/utils/common.py | 95 + + copilot-tune/src/utils/config/__init__.py | 0 + copilot-tune/src/utils/config/app_config.py | 305 +++ + .../src/utils/config/global_config.py | 93 + + copilot-tune/src/utils/constant.py | 6 + + 
copilot-tune/src/utils/json_repair.py | 15 + + copilot-tune/src/utils/llm.py | 33 + + copilot-tune/src/utils/manager/__init__.py | 0 + .../src/utils/manager/task_manager.py | 185 ++ + copilot-tune/src/utils/metrics.py | 16 + + copilot-tune/src/utils/rag/__init__.py | 0 + copilot-tune/src/utils/rag/knob_rag.py | 208 ++ + copilot-tune/src/utils/shell_execute.py | 179 ++ + copilot-tune/src/utils/thread_pool.py | 236 ++ + 120 files changed, 16985 insertions(+) + create mode 100644 copilot-tune/FAQ.md + create mode 100644 copilot-tune/LICENSE + create mode 100644 copilot-tune/README.md + create mode 100644 copilot-tune/__init__.py + create mode 100644 copilot-tune/config/.env.yaml + create mode 100644 copilot-tune/config/app_config.yaml + create mode 100644 copilot-tune/config/knob_rag_config.json + create mode 100644 copilot-tune/config/openapi.yaml + create mode 100644 copilot-tune/config/optimize_config.yaml + create mode 100644 copilot-tune/doc/zh/README.md + create mode 100644 copilot-tune/requirements.txt + create mode 100644 copilot-tune/scripts/ceph/benchmark.sh + create mode 100644 copilot-tune/scripts/ceph/parse_benchmark.sh + create mode 100644 copilot-tune/scripts/flink/benchmark.sh + create mode 100644 copilot-tune/scripts/flink/parse_benchmark.sh + create mode 100644 copilot-tune/scripts/gaussdb/benchmark.sh + create mode 100644 copilot-tune/scripts/gaussdb/parse_benmark.sh + create mode 100644 copilot-tune/scripts/mysql/benchmark.sh + create mode 100644 copilot-tune/scripts/mysql/parse_benchmark.sh + create mode 100644 copilot-tune/scripts/nginx/benchmark.sh + create mode 100644 copilot-tune/scripts/nginx/parse_benchmark.sh + create mode 100644 copilot-tune/scripts/postgresql/benchmark.sh + create mode 100644 copilot-tune/scripts/postgresql/parse_benchmark.sh + create mode 100644 copilot-tune/scripts/redis/benchmark.sh + create mode 100644 copilot-tune/scripts/redis/parse_benchmark.sh + create mode 100644 copilot-tune/scripts/spark/benchmark.sh + create mode 100644 copilot-tune/scripts/spark/parse_benchmark.sh + create mode 100644 copilot-tune/service/tune-mcpserver.service + create mode 100644 copilot-tune/service/tune-openapi.service + create mode 100644 copilot-tune/setup.py + create mode 100644 copilot-tune/src/__init__.py + create mode 100644 copilot-tune/src/config.py + create mode 100644 copilot-tune/src/knowledge_base/README.md + create mode 100644 copilot-tune/src/knowledge_base/knob_params/ceph.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/flink.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/gaussdb.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/mysql.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/nginx.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/pgsql.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/redis.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/spark.json + create mode 100644 copilot-tune/src/knowledge_base/knob_params/system.json + create mode 100644 copilot-tune/src/knowledge_base/optimize/README.md + create mode 100644 copilot-tune/src/knowledge_base/optimize/parameter/mysql.jsonl + create mode 100644 copilot-tune/src/knowledge_base/optimize/parameter/spark.jsonl + create mode 100644 copilot-tune/src/knowledge_base/optimize/parameter/system.jsonl + create mode 100644 copilot-tune/src/knowledge_base/optimize/strategy/system.jsonl + create mode 100644 
copilot-tune/src/knowledge_base/params/mysql_params.json + create mode 100644 copilot-tune/src/knowledge_base/params/system_params.json + create mode 100644 copilot-tune/src/performance_analyzer/__init__.py + create mode 100644 copilot-tune/src/performance_analyzer/app_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/base_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/cpu_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/disk_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/memory_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/micro_dep_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/network_analyzer.py + create mode 100644 copilot-tune/src/performance_analyzer/performance_analyzer.py + create mode 100644 copilot-tune/src/performance_collector/__init__.py + create mode 100644 copilot-tune/src/performance_collector/app_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/ceph_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/flink_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/gaussdb_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/mysql_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/nginx_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/pgsql_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/redis_collector.py + create mode 100644 copilot-tune/src/performance_collector/application/spark_collector.py + create mode 100644 copilot-tune/src/performance_collector/base_collector.py + create mode 100644 copilot-tune/src/performance_collector/cpu_collector.py + create mode 100644 copilot-tune/src/performance_collector/disk_collector.py + create mode 100644 copilot-tune/src/performance_collector/memory_collector.py + create mode 100644 copilot-tune/src/performance_collector/metric_collector.py + create mode 100644 copilot-tune/src/performance_collector/micro_dep_collector.py + create mode 100644 copilot-tune/src/performance_collector/network_collector.py + create mode 100644 copilot-tune/src/performance_collector/static_metric_profile_collector.py + create mode 100644 copilot-tune/src/performance_collector/static_profile_collector.py + create mode 100644 copilot-tune/src/performance_optimizer/__init__.py + create mode 100644 copilot-tune/src/performance_optimizer/base_optimizer.py + create mode 100644 copilot-tune/src/performance_optimizer/knob_optimizer.py + create mode 100644 copilot-tune/src/performance_optimizer/param_knowledge.py + create mode 100644 copilot-tune/src/performance_optimizer/param_optimizer.py + create mode 100644 copilot-tune/src/performance_optimizer/param_recommender.py + create mode 100644 copilot-tune/src/performance_optimizer/set_knob_cmd.jsonl + create mode 100644 copilot-tune/src/performance_optimizer/strategy_optimizer.py + create mode 100644 copilot-tune/src/performance_test/__init__.py + create mode 100644 copilot-tune/src/performance_test/pressure_test.py + create mode 100644 copilot-tune/src/start_mcpserver.py + create mode 100644 copilot-tune/src/start_tune.py + create mode 100644 copilot-tune/src/start_workflow.py + create mode 100644 copilot-tune/src/tests/manager/task_manager.py + create mode 100644 copilot-tune/src/tests/manager/test_trigger_signal.py + create mode 100644 
copilot-tune/src/tests/mock_ssh_client.py + create mode 100644 copilot-tune/src/tests/test_perf_optim/param_knowledge.py + create mode 100644 copilot-tune/src/tests/test_utils/collector/collector_trigger.py + create mode 100644 copilot-tune/src/tests/test_utils/collector/metric_collector.py + create mode 100644 copilot-tune/src/tests/test_utils/collector/test_cpu_collector.py + create mode 100644 copilot-tune/src/tests/test_utils/collector/test_io_collector.py + create mode 100644 copilot-tune/src/tests/test_utils/config/app_config.py + create mode 100644 copilot-tune/src/tests/test_utils/test_pressure_test.py + create mode 100644 copilot-tune/src/tests/test_utils/thread_pool.py + create mode 100644 copilot-tune/src/utils/README.md + create mode 100644 copilot-tune/src/utils/__init__.py + create mode 100644 copilot-tune/src/utils/collector/__init__.py + create mode 100644 copilot-tune/src/utils/collector/collector_trigger.py + create mode 100644 copilot-tune/src/utils/collector/metric_collector.py + create mode 100644 copilot-tune/src/utils/common.py + create mode 100644 copilot-tune/src/utils/config/__init__.py + create mode 100644 copilot-tune/src/utils/config/app_config.py + create mode 100644 copilot-tune/src/utils/config/global_config.py + create mode 100644 copilot-tune/src/utils/constant.py + create mode 100644 copilot-tune/src/utils/json_repair.py + create mode 100644 copilot-tune/src/utils/llm.py + create mode 100644 copilot-tune/src/utils/manager/__init__.py + create mode 100644 copilot-tune/src/utils/manager/task_manager.py + create mode 100644 copilot-tune/src/utils/metrics.py + create mode 100644 copilot-tune/src/utils/rag/__init__.py + create mode 100644 copilot-tune/src/utils/rag/knob_rag.py + create mode 100644 copilot-tune/src/utils/shell_execute.py + create mode 100644 copilot-tune/src/utils/thread_pool.py + +diff --git a/copilot-tune/FAQ.md b/copilot-tune/FAQ.md +new file mode 100644 +index 0000000..d9310b9 +--- /dev/null ++++ b/copilot-tune/FAQ.md +@@ -0,0 +1,230 @@ ++1. 
运行时,报错 `TypeError: Client.__init__() got an unexpected keyword argument 'proxies'` ++ ++错误栈: ++``` ++Traceback (most recent call last): ++ File "/root/workspace/eulercopilot/A-Tune/src/testmain.py", line 10, in ++ testCollector = MetricCollector( ++ ^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/performance_collector/metric_collector.py", line 47, in __init__ ++ cmd = get_mysql_cmd( ++ ^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/performance_collector/mysql_collector.py", line 112, in get_mysql_cmd ++ is_mysql_running = check_mysql_state(host_ip=host_ip, host_port=host_port, host_user=host_user, host_password=host_password) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/performance_collector/mysql_collector.py", line 63, in check_mysql_state ++ mysql_state = get_llm_response(prompt=check_prompt.format(mysql_state=res[check_mysql_state_cmd])) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/llm.py", line 8, in get_llm_response ++ client = ChatOpenAI( ++ ^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/langchain_core/load/serializable.py", line 130, in __init__ ++ super().__init__(*args, **kwargs) ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/pydantic/main.py", line 193, in __init__ ++ self.__pydantic_validator__.validate_python(data, self_instance=self) ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/langchain_openai/chat_models/base.py", line 551, in validate_environment ++ self.root_client = openai.OpenAI(**client_params, **sync_specific) # type: ignore[arg-type] ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/openai/_client.py", line 123, in __init__ ++ super().__init__( ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/openai/_base_client.py", line 856, in __init__ ++ self._client = http_client or SyncHttpxClientWrapper( ++ ^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/openai/_base_client.py", line 754, in __init__ ++ super().__init__(**kwargs) ++TypeError: Client.__init__() got an unexpected keyword argument 'proxies' ++``` ++原因:连接 llm server 时,被 proxy 拦截。 ++ ++解决方法:设置 no_proxy 变量,防止被 proxy 拦截。 ++```BASH ++export no_proxy=9.82.33.59,$no_proxy ++``` ++ ++解决后测试: ++```BASH ++curl --location 'http://9.82.33.59:11434/v1/chat/completions' \ ++--header 'Content-Type: application/json' \ ++--data '{ ++ "model": "qwen2:72b", ++ "messages": [ ++ {"role": "user", "content": "如何根据进程号采集cpu占用率"} ++ ], ++ "stream": true ++ }' ++``` ++ ++预期输出类似如下,则说明修改成功: ++``` ++data: {"id":"chatcmpl-603","object":"chat.completion.chunk","created":1747040993,"model":"qwen2:72b","system_fingerprint":"fp_ollama","choices":[{"index":0,"delta":{"role":"assistant","content":"在"},"finish_reason":null}]} ++ ++data: {"id":"chatcmpl-603","object":"chat.completion.chunk","created":1747040993,"model":"qwen2:72b","system_fingerprint":"fp_ollama","choices":[{"index":0,"delta":{"role":"assistant","content":"Linux"},"finish_reason":null}]} ++ ++data: 
{"id":"chatcmpl-603","object":"chat.completion.chunk","created":1747040993,"model":"qwen2:72b","system_fingerprint":"fp_ollama","choices":[{"index":0,"delta":{"role":"assistant","content":"系统"},"finish_reason":null}]} ++``` ++ ++2. 运行时,报错 `ValueError: Found array with 0 feature(s) (shape=(39, 0)) while a minimum of 1 is required by the normalize function.` ++ ++错误栈: ++``` ++Building index for system.jsonl...: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [00:19<00:00, 1.99it/s] ++Traceback (most recent call last): ++ File "/root/workspace/eulercopilot/A-Tune/src/testmain.py", line 36, in ++ plan, isfinish, feedback = testKnob.run() ++ ^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/performance_optimizer/base_optimizer.py", line 155, in run ++ is_execute, optimization_plan = self.think(history=record) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/performance_optimizer/knob_optimizer.py", line 56, in think ++ knobs = rag.run() ++ ^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/utils/rag/knob_rag.py", line 154, in run ++ system_index, system_docs = self.build_index("system") ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/src/utils/rag/knob_rag.py", line 109, in build_index ++ normalized_embeddings = normalize(np.array(embeddings).astype('float32')) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper ++ return func(*args, **kwargs) ++ ^^^^^^^^^^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/sklearn/preprocessing/_data.py", line 1933, in normalize ++ X = check_array( ++ ^^^^^^^^^^^^ ++ File "/root/workspace/eulercopilot/A-Tune/venv/lib64/python3.11/site-packages/sklearn/utils/validation.py", line 1096, in check_array ++ raise ValueError( ++ValueError: Found array with 0 feature(s) (shape=(39, 0)) while a minimum of 1 is required by the normalize function. ++``` ++原因:embedding 接口不匹配。 ++ ++解决方法:更新 REMOTE_EMBEDDING_ENDPOINT 配置为匹配的 embedding 接口 url。 ++ ++3. 
openai客户端连接服务器时ssl证书校验失败 ++ ++错误栈: ++``` ++2025-09-01 15:05:56 - INFO [_base_client.py:_retry_request:1086] - Retrying request to /chat/completions in 0.995184 seconds ++Traceback (most recent call last): ++ File "/usr/local/lib/python3.11/site-packages/httpx/_transports/default.py", line 72, in map_httpcore_exceptions ++ yield ++ File "/usr/local/lib/python3.11/site-packages/httpx/_transports/default.py", line 236, in handle_request ++ resp = self._pool.handle_request(req) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpcore/_sync/connection_pool.py", line 256, in handle_request ++ raise exc from None ++ File "/usr/local/lib/python3.11/site-packages/httpcore/_sync/connection_pool.py", line 236, in handle_request ++ response = connection.handle_request( ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpcore/_sync/http_proxy.py", line 316, in handle_request ++ stream = stream.start_tls(**kwargs) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpcore/_sync/http11.py", line 376, in start_tls ++ return self._stream.start_tls(ssl_context, server_hostname, timeout) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpcore/_backends/sync.py", line 154, in start_tls ++ with map_exceptions(exc_map): ++ File "/usr/lib64/python3.11/contextlib.py", line 155, in __exit__ ++ self.gen.throw(typ, value, traceback) ++ File "/usr/local/lib/python3.11/site-packages/httpcore/_exceptions.py", line 14, in map_exceptions ++ raise to_exc(exc) from exc ++httpcore.ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1006) ++ ++The above exception was the direct cause of the following exception: ++ ++Traceback (most recent call last): ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 990, in _request ++ response = self._client.send( ++ ^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpx/_client.py", line 926, in send ++ response = self._send_handling_auth( ++ ^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpx/_client.py", line 954, in _send_handling_auth ++ response = self._send_handling_redirects( ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpx/_client.py", line 991, in _send_handling_redirects ++ response = self._send_single_request(request) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpx/_client.py", line 1027, in _send_single_request ++ response = transport.handle_request(request) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/httpx/_transports/default.py", line 235, in handle_request ++ with map_httpcore_exceptions(): ++ File "/usr/lib64/python3.11/contextlib.py", line 155, in __exit__ ++ self.gen.throw(typ, value, traceback) ++ File "/usr/local/lib/python3.11/site-packages/httpx/_transports/default.py", line 89, in map_httpcore_exceptions ++ raise mapped_exc(message) from exc ++httpx.ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1006) ++ ++The above exception was the direct cause of the following exception: ++ ++Traceback (most recent call last): ++ File "/root/workspace/copilot/A-Tune-copilot/src/utils/llm.py", line 35, in ++ res = get_llm_response("介绍一下中国") ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^ 
++ File "/root/workspace/copilot/A-Tune-copilot/src/utils/llm.py", line 19, in get_llm_response ++ result = client.invoke(input=prompt) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py", line 307, in invoke ++ self.generate_prompt( ++ File "/usr/local/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py", line 843, in generate_prompt ++ return self.generate(prompt_messages, stop=stop, callbacks=callbacks, **kwargs) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py", line 683, in generate ++ self._generate_with_cache( ++ File "/usr/local/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py", line 908, in _generate_with_cache ++ result = self._generate( ++ ^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/langchain_openai/chat_models/base.py", line 689, in _generate ++ return generate_from_stream(stream_iter) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/langchain_core/language_models/chat_models.py", line 91, in generate_from_stream ++ generation = next(stream, None) ++ ^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/langchain_openai/chat_models/base.py", line 656, in _stream ++ response = self.client.create(**payload) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_utils/_utils.py", line 274, in wrapper ++ return func(*args, **kwargs) ++ ^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/resources/chat/completions.py", line 815, in create ++ return self._post( ++ ^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 1277, in post ++ return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)) ++ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 954, in request ++ return self._request( ++ ^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 1014, in _request ++ return self._retry_request( ++ ^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 1092, in _retry_request ++ return self._request( ++ ^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 1014, in _request ++ return self._retry_request( ++ ^^^^^^^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 1092, in _retry_request ++ return self._request( ++ ^^^^^^^^^^^^^^ ++ File "/usr/local/lib/python3.11/site-packages/openai/_base_client.py", line 1024, in _request ++ raise APIConnectionError(request=request) from err ++openai.APIConnectionError: Connection error. 
++``` ++ ++解决办法:修改 /usr/local/lib/python3.11/site-packages/httpx/_client.py ,在 httpx.Client 创建时改为默认禁用ssl校验 verify=False: ++ ++```python ++class Client(BaseClient): ++ def __init__( ++ self, ++ *, ++ auth: AuthTypes | None = None, ++ params: QueryParamTypes | None = None, ++ headers: HeaderTypes | None = None, ++ cookies: CookieTypes | None = None, ++ # verify: VerifyTypes = True, ++ verify: VerifyTypes = False, # 禁用ssl校验 ++ cert: CertTypes | None = None, ++ http1: bool = True, ++ http2: bool = False, ++``` +\ No newline at end of file +diff --git a/copilot-tune/LICENSE b/copilot-tune/LICENSE +new file mode 100644 +index 0000000..a589e86 +--- /dev/null ++++ b/copilot-tune/LICENSE +@@ -0,0 +1,127 @@ ++ 木兰宽松许可证, 第2版 ++ ++ 木兰宽松许可证, 第2版 ++ 2020年1月 http://license.coscl.org.cn/MulanPSL2 ++ ++ ++ 您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: ++ ++ 0. 定义 ++ ++ “软件”是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 ++ ++ “贡献”是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 ++ ++ “贡献者”是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 ++ ++ “法人实体”是指提交贡献的机构及其“关联实体”。 ++ ++ “关联实体”是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 ++ ++ 1. 授予版权许可 ++ ++ 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 ++ ++ 2. 授予专利许可 ++ ++ 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 ++ ++ 3. 无商标许可 ++ ++ “本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 ++ ++ 4. 分发限制 ++ ++ 您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 ++ ++ 5. 免责声明与责任限制 ++ ++ “软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 ++ ++ 6. 语言 ++ “本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 ++ ++ 条款结束 ++ ++ 如何将木兰宽松许可证,第2版,应用到您的软件 ++ ++ 如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: ++ ++ 1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; ++ ++ 2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; ++ ++ 3, 请将如下声明文本放入每个源文件的头部注释中。 ++ ++ Copyright (c) [Year] [name of copyright holder] ++ [Software Name] is licensed under Mulan PSL v2. ++ You can use this software according to the terms and conditions of the Mulan PSL v2. ++ You may obtain a copy of Mulan PSL v2 at: ++ http://license.coscl.org.cn/MulanPSL2 ++ THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. ++ See the Mulan PSL v2 for more details. ++ ++ ++ Mulan Permissive Software License,Version 2 ++ ++ Mulan Permissive Software License,Version 2 (Mulan PSL v2) ++ January 2020 http://license.coscl.org.cn/MulanPSL2 ++ ++ Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: ++ ++ 0. Definition ++ ++ Software means the program and related documents which are licensed under this License and comprise all Contribution(s). ++ ++ Contribution means the copyrightable work licensed by a particular Contributor under this License. ++ ++ Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. ++ ++ Legal Entity means the entity making a Contribution and all its Affiliates. 
++ ++ Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. ++ ++ 1. Grant of Copyright License ++ ++ Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. ++ ++ 2. Grant of Patent License ++ ++ Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. ++ ++ 3. No Trademark License ++ ++ No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in Section 4. ++ ++ 4. Distribution Restriction ++ ++ You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. ++ ++ 5. Disclaimer of Warranty and Limitation of Liability ++ ++ THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. ++ ++ 6. Language ++ ++ THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. 
++ ++ END OF THE TERMS AND CONDITIONS ++ ++ How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software ++ ++ To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: ++ ++ i Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; ++ ++ ii Create a file named “LICENSE” which contains the whole context of this License in the first directory of your software package; ++ ++ iii Attach the statement to the appropriate annotated syntax at the beginning of each source file. ++ ++ ++ Copyright (c) [Year] [name of copyright holder] ++ [Software Name] is licensed under Mulan PSL v2. ++ You can use this software according to the terms and conditions of the Mulan PSL v2. ++ You may obtain a copy of Mulan PSL v2 at: ++ http://license.coscl.org.cn/MulanPSL2 ++ THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. ++ See the Mulan PSL v2 for more details. +diff --git a/copilot-tune/README.md b/copilot-tune/README.md +new file mode 100644 +index 0000000..5823fad +--- /dev/null ++++ b/copilot-tune/README.md +@@ -0,0 +1,199 @@ ++# EulerCopilot Tune安装使用指南 ++ ++### 项目简介 ++EulerCopilot Tune通过采集系统、微架构、应用等维度的指标数据,结合大模型和定制化的prompt工程,针对不同应用的可调参数给出可靠的参数推荐,同时根据推荐的参数运行benchmark,与baseline做对比并计算出推荐参数对应用性能的提升值。 ++ ++### 软件架构 ++软件架构说明 ++ ++### 安装教程 ++ ++1. 下载gitee源码,gitee代码仓地址: ++https://gitee.com/openeuler/A-Tune/tree/euler-copilot-tune/ ++(注意:分支指定为euler-copilot-tune) ++2. 安装其他依赖 ++```bash ++#1.调优程序运行机器安装python venv依赖 ++yum install python3-devel krb5-devel ++#2.目标应用所在机器安装调优依赖并启动sysstat ++yum install sysstat perf ++systemctl start sysstat ++``` ++3. 调优程序运行机器安装python依赖: ++```BASH ++#1.创建并加载python venv ++python3 -m venv venv ++source venv/bin/activate ++ ++#2.安装python依赖包 ++pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple ++``` ++ ++### 使用指南 ++ ++1. 准备env yaml,放入项目的config/.env.yaml中,具体格式如下: ++```YAML ++LLM_KEY: "YOUR_LLM_KEY" ++LLM_URL: "YOUR_LLM_URL" ++LLM_MODEL_NAME: "YOUR_LLM_MODEL_NAME" ++LLM_MAX_TOKENS: ++ ++REMOTE_EMBEDDING_ENDPOINT: "YOUR_EMBEDDING_MODEL_URL" ++REMOTE_EMBEDDING_MODEL_NAME: "YOUR_MODEL_NAME" ++ ++servers: ++ - ip: "" #应用所在ip ++ host_user: "" #登录机器的user id ++ password: "" #登录机器的密码 ++ port: #应用所在ip的具体port ++ app: "mysql" #当前支持mysql、nginx、pgsql、spark、flink、redis、ceph、gaussdb ++ target_process_name: "mysqld" #调优应用的name ++ business_context: "高并发数据库服务,CPU负载主要集中在用户态处理" #调优应用的描述(用于策略生成) ++ max_retries: 3 ++ delay: 1.0 ++ ++feature: ++ - need_restart_application: False #修改参数之后是否需要重启应用使参数生效 ++ need_recover_cluster: False #调优过程中是否需要恢复集群 ++ microDep_collector: True #是否开启微架构指标采集 ++ pressure_test_mode: True #是否通过压测模拟负载环境 ++ tune_system_param: False #是否调整系统参数 ++ tune_app_param: True #是否调整应用参数 ++ strategy_optimization: False #是否需要策略推荐 ++ benchmark_timeout: 3600 #benchmark执行超时限制 ++``` ++ ++2. 
完善app_config.yaml,放入项目的config/app_config.yaml中(重点是补充set_param_template、get_param_template、benchmark脚本),具体内容如下: ++```YAML ++mysql: ++ user: "root" ++ password: "123456" ++ config_file: "/etc/my.cnf" ++ port: 3306 ++ set_param_template: 'grep -q "^$param_name\\s*=" "$config_file" && sed -i "s/^$param_name\\s*=.*/$param_name = $param_value/" "$config_file" || sed -i "/\\[mysqld\\]/a $param_name = $param_value" "$config_file"' ++ get_param_template: 'grep -E "^$param_name\s*=" $config_file | cut -d= -f2- | xargs' ++ stop_workload: "systemctl stop mysqld" ++ start_workload: "systemctl start mysqld" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/mysql/parse_benchmark.sh $host_ip $port $user $password" ++ performance_metric: "QPS" ++ ++flink: ++ set_param_template: 'sh /home/wsy/set_param.sh $param_name $param_value' ++ get_param_template: 'sh /home/wsy/get_param.sh $param_name' ++ benchmark: "sh /home/wsy/nexmark_test.sh" ++ stop_workload: 'docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/stop-cluster.sh && /usr/local/nexmark/bin/shutdown_cluster.sh"' ++ start_workload: 'docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/start-cluster.sh"' ++ performance_metric: "THROUGHPUT" ++ ++pgsql: ++ user: "postgres" ++ password: "postgres" ++ config_file: "/data/data1/pgsql/postgresql.conf" ++ port: 5432 ++ set_param_template: 'grep -qE "^\s*$param_name\s*=" "$config_file" && sed -i "s/^[[:space:]]*$param_name[[:space:]]*=.*/$param_name = $param_value/" "$config_file" || echo "$param_name = $param_value" >> "$config_file"' ++ get_param_template: 'grep -oP "^\s*$param_name\s*=\s*\K.*" "$config_file"' ++ stop_workload: "su - postgres -c '/usr/local/pgsql/bin/pg_ctl stop -D /data/data1/pgsql/ -m fast'" ++ start_workload: "su - postgres -c '/usr/local/pgsql/bin/pg_ctl start -D /data/data1/pgsql/ -l /var/log/postgresql/postgresql.log'" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/postgresql/parse_benchmark.sh $host_ip $port $user $password" ++ performance_metric: "QPS" ++ ++spark: ++ set_param_template: 'sh /path/of/set_param.sh $param_name $param_value' ++ get_param_template: 'sh /path/of/get_param.sh $param_name' ++ benchmark: "sh /path/of/spark_benchmark.sh" ++ performance_metric: "DURATION" ++ ++nginx: ++ port: 10000 ++ config_file: "/usr/local/nginx/conf/nginx.conf" ++ set_param_template: 'grep -q "^\\s*$param_name\\s\\+" "$config_file" && sed -i "s|^\\s*$param_name\\s\\+.*| $param_name $param_value;|" "$config_file" || sed -i "/http\\s*{/a\ $param_name $param_value;" "$config_file"' ++ get_param_template: 'grep -E "^\\s*$param_name\\s+" $config_file | head -1 | sed -E "s/^\\s*$param_name\\s+(.*);/\\1/"' ++ stop_workload: "/usr/local/nginx/sbin/nginx -s reload" ++ start_workload: "/usr/local/nginx/sbin/nginx -s reload" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/nginx/parse_benchmark.sh $host_ip $port" ++ performance_metric: "QPS" ++ ++ceph: ++ set_param_template: 'ceph config set osd "$param_name" "$param_value"' ++ get_param_template: 'sh /path/of/get_params.sh' ++ start_workload: "sh /path/of/restart_ceph.sh" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/ceph/parse_benchmark.sh" ++ performance_metric: "BANDWIDTH" ++ ++gaussdb: ++ user: "" ++ password: "" ++ config_file: "/path/of/config_file" ++ port: 5432 ++ set_param_template: 'gs_guc set -Z datanode -N all -I all -c "${param_name}=${param_value}"' ++ get_param_template: 'gs_guc check -Z datanode -N all -I all -c "${param_name}"' ++ stop_workload: 
"cm_ctl stop -m i" ++ start_workload: "cm_ctl start" ++ recover_workload: "$EXECUTE_MODE:local sh /path/of/gaussdb_cluster_recover.sh" ++ benchmark: "$EXECUTE_MODE:local sh/path/of/gaussdb_benchmark.sh" ++ performance_metric: "DURATION" ++ ++system: ++ set_param_template: 'sysctl -w $param_name=$param_value' ++ get_param_template: 'sysctl $param_name' ++ ++redis: ++ port: 6379 ++ config_file: "/etc/redis.conf" ++ set_param_template: "sed -i 's/^$param_name/$param_name $param_value/g' $config_file" ++ get_param_template: "grep -P '$param_name' $config_file | awk '{print $2}" ++ start_workload: "systemctl start redis" ++ stop_workload: "systemctl stop redis" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/redis/parse_benchmark.sh $host_ip $port " ++ performance_metric: "QPS" ++ ++ ++``` ++其中: ++set_param_template:根据调优结果修改应用参数,用于后续测试效果 ++get_param_template:获取应用参数 ++recover_workload: 恢复集群 ++benchmark:benchmark脚本,格式如下: ++ ++```bash ++#(必须有)用于通知框架可以执行指标采集的标识 ++echo 1 > /tmp/euler-copilot-fifo ++ ++#benchmark具体执行 ++cd /root/spark_auto_deploy_arm/spark_test ++sh tpcds_test_1t_spark331_linearity_2p.sh > /home/cxm/spark_benchmark.log 2>&1 ++ ++#(必须有)计算并输出相应的performance_metric的语句 ++cd /home/cxm ++time_taken=$(grep "time_taken:" "spark_benchmark.log" | sed -E 's/.*time_taken:([0-9.]+)s.*/\1/' | paste -sd+ | bc | xargs printf "%.2f") ++echo $time_taken ++``` ++3. 运行EulerCopilot ++```bash ++export PYTHONPATH="`pwd`:$PYTHONPATH" ++python3 src/start_tune.py ++ ++``` ++#### 服务的方式运行: ++ ++1、安装服务 ++ ++​ 进入项目目录,执行python setup.py install ++ ++2、在/etc/euler-copilot-tune 目录修改配置文件,具体内容参考上面源码部署方式 ++ ++3、启动服务 ++ ++~~~bash ++#命令行执行如下命令 ++#开启调优 ++euler-copilot-tune ++#开启mcpserver 日志通过执行 journalctl -xe -u tune-mcpserver --all -f 查看 ++tune-mcpserver ++~~~ ++ ++​ ++ ++### 常见问题解决 ++ ++见 [FAQ.md](./FAQ.md) +\ No newline at end of file +diff --git a/copilot-tune/__init__.py b/copilot-tune/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/config/.env.yaml b/copilot-tune/config/.env.yaml +new file mode 100644 +index 0000000..d0da340 +--- /dev/null ++++ b/copilot-tune/config/.env.yaml +@@ -0,0 +1,72 @@ ++LLM_KEY: "YOUR_LLM_KEY" ++LLM_URL: "YOUR_LLM_URL" ++LLM_MODEL_NAME: "YOUR_LLM_MODEL_NAME" ++LLM_MAX_TOKENS: ++ ++REMOTE_EMBEDDING_ENDPOINT: "YOUR_EMBEDDING_MODEL_URL" ++REMOTE_EMBEDDING_MODEL_NAME: "YOUR_MODEL_NAME" ++ ++servers: ++ - ip: "9.82.36.53" ++ host_user: "root" ++ password: "Huawei12#$" ++ port: 22 ++ app: "flink" ++ target_process_name: "jobmanager" ++ business_context: "高并发流处理服务,CPU负载主要集中在用户态算子与序列化反序列化" ++ max_retries: 3 ++ delay: 1.0 ++ ++ ++#servers: ++# - ip: "" ++# host_user: "" ++# password: "" ++# port: 22 ++# app: "mysql" ++# target_process_name: "mysqld" ++# max_retries: 3 ++# delay: 1.0 ++ ++#servers: ++# - ip: "" ++# host_user: "" ++# password: "" ++# port: ++# app: "spark" ++# target_process_name: "java" ++# business_context: "基于内存计算的分布式批处理框架,适合大规模数据处理任务,CPU负载主要集中在用户态的序列化、反序列化及任务调度执行过程" ++# max_retries: 3 ++# delay: 1.0 ++ ++# servers: ++# - ip: "" ++# host_user: "" ++# password: "" ++# port: ++# app: "gaussdb" ++# target_process_name: "gaussdb" ++# business_context: "高并发数据库服务,CPU负载主要集中在用户态处理" ++# max_retries: 3 ++# delay: 1.0 ++ ++#servers: ++# - ip: "" ++# host_user: "" ++# password: "" ++# port: ++# app: "nginx" ++# target_process_name: "nginx" ++# business_context: "高并发Web服务,CPU负载主要集中在用户态处理" ++# max_retries: 3 ++# delay: 1.0 ++ ++feature: ++ - need_restart_application: False ++ need_recover_cluster: False ++ microDep_collector: True ++ pressure_test_mode: True 
++ tune_system_param: False ++ tune_app_param: True ++ strategy_optimization: False ++ benchmark_timeout: 3600 +diff --git a/copilot-tune/config/app_config.yaml b/copilot-tune/config/app_config.yaml +new file mode 100644 +index 0000000..8b8316c +--- /dev/null ++++ b/copilot-tune/config/app_config.yaml +@@ -0,0 +1,81 @@ ++mysql: ++ user: "root" ++ password: "123456" ++ config_file: "/etc/my.cnf" ++ port: 3306 ++ set_param_template: 'grep -q "^$param_name\\s*=" "$config_file" && sed -i "s/^$param_name\\s*=.*/$param_name = $param_value/" "$config_file" || sed -i "/\\[mysqld\\]/a $param_name = $param_value" "$config_file"' ++ get_param_template: 'grep -E "^$param_name\s*=" $config_file | cut -d= -f2- | xargs' ++ stop_workload: "systemctl stop mysqld" ++ start_workload: "systemctl start mysqld" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/mysql/parse_benchmark.sh $host_ip $port $user $password" ++ performance_metric: "QPS" ++ ++flink: ++ set_param_template: 'sh /home/wsy/set_param.sh $param_name $param_value' ++ get_param_template: 'sh /home/wsy/get_param.sh $param_name' ++ benchmark: "sh /home/wsy/nexmark_test.sh" ++ stop_workload: 'docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/stop-cluster.sh && /usr/local/nexmark/bin/shutdown_cluster.sh"' ++ start_workload: 'docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/start-cluster.sh"' ++ performance_metric: "THROUGHPUT" ++ ++pgsql: ++ user: "postgres" ++ password: "postgres" ++ config_file: "/data/data1/pgsql/postgresql.conf" ++ port: 5432 ++ set_param_template: 'grep -qE "^\s*$param_name\s*=" "$config_file" && sed -i "s/^[[:space:]]*$param_name[[:space:]]*=.*/$param_name = $param_value/" "$config_file" || echo "$param_name = $param_value" >> "$config_file"' ++ get_param_template: 'grep -oP "^\s*$param_name\s*=\s*\K.*" "$config_file"' ++ stop_workload: "su - postgres -c '/usr/local/pgsql/bin/pg_ctl stop -D /data/data1/pgsql/ -m fast'" ++ start_workload: "su - postgres -c '/usr/local/pgsql/bin/pg_ctl start -D /data/data1/pgsql/ -l /var/log/postgresql/postgresql.log'" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/postgresql/parse_benchmark.sh $host_ip $port $user $password" ++ performance_metric: "QPS" ++ ++spark: ++ set_param_template: 'sh /path/of/set_param.sh $param_name $param_value' ++ get_param_template: 'sh /path/of/get_param.sh $param_name' ++ benchmark: "sh /path/of/spark_benchmark.sh" ++ performance_metric: "DURATION" ++ ++nginx: ++ port: 10000 ++ config_file: "/usr/local/nginx/conf/nginx.conf" ++ set_param_template: 'grep -q "^\\s*$param_name\\s\\+" "$config_file" && sed -i "s|^\\s*$param_name\\s\\+.*| $param_name $param_value;|" "$config_file" || sed -i "/http\\s*{/a\ $param_name $param_value;" "$config_file"' ++ get_param_template: 'grep -E "^\\s*$param_name\\s+" $config_file | head -1 | sed -E "s/^\\s*$param_name\\s+(.*);/\\1/"' ++ stop_workload: "/usr/local/nginx/sbin/nginx -s reload" ++ start_workload: "/usr/local/nginx/sbin/nginx -s reload" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/nginx/parse_benchmark.sh $host_ip $port" ++ performance_metric: "QPS" ++ ++ceph: ++ set_param_template: 'ceph config set osd "$param_name" "$param_value"' ++ get_param_template: 'sh /path/of/get_params.sh' ++ start_workload: "sh /path/of/restart_ceph.sh" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/ceph/parse_benchmark.sh" ++ performance_metric: "BANDWIDTH" ++ ++gaussdb: ++ user: "" ++ password: "" ++ config_file: "/path/of/config_file" ++ port: 5432 
++ set_param_template: 'gs_guc set -Z datanode -N all -I all -c "${param_name}=${param_value}"' ++ get_param_template: 'gs_guc check -Z datanode -N all -I all -c "${param_name}"' ++ stop_workload: "cm_ctl stop -m i" ++ start_workload: "cm_ctl start" ++ recover_workload: "$EXECUTE_MODE:local sh /path/of/gaussdb_cluster_recover.sh" ++ benchmark: "$EXECUTE_MODE:local sh /path/of/gaussdb_benchmark.sh" ++ performance_metric: "DURATION" ++ ++system: ++ set_param_template: 'sysctl -w $param_name=$param_value' ++ get_param_template: 'sysctl $param_name' ++ ++redis: ++ port: 6379 ++ config_file: "/etc/redis.conf" ++ set_param_template: "sed -i 's/^$param_name/$param_name $param_value/g' $config_file" ++ get_param_template: "grep -P '$param_name' $config_file | awk '{print $2}'" ++ start_workload: "systemctl start redis" ++ stop_workload: "systemctl stop redis" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/redis/parse_benchmark.sh $host_ip $port " ++ performance_metric: "QPS" +diff --git a/copilot-tune/config/knob_rag_config.json b/copilot-tune/config/knob_rag_config.json +new file mode 100644 +index 0000000..2eb63f1 +--- /dev/null ++++ b/copilot-tune/config/knob_rag_config.json +@@ -0,0 +1,4 @@ ++{ ++ "threshold": 0.5, ++ "topk": 10 ++} +\ No newline at end of file +diff --git a/copilot-tune/config/openapi.yaml b/copilot-tune/config/openapi.yaml +new file mode 100644 +index 0000000..ddeefb0 +--- /dev/null ++++ b/copilot-tune/config/openapi.yaml +@@ -0,0 +1,69 @@ ++openapi: 3.0.0 ++info: ++ title: 性能调优分析接口 ++ description: 用于采集指定ip机器的性能数据并分析性能瓶颈,推荐可调整参数 ++ version: 1.0.0 ++servers: ++ - url: http://localhost:8092 ++paths: ++ /collector: ++ get: ++ summary: 获取数据采集结果 ++ description: 通过ip获取数据采集结果 ++ parameters: ++ - name: ip ++ in: query ++ required: true ++ schema: ++ type: string ++ example: "9.82.201.111" ++ description: 待采集指标的ip地址 ++ responses: ++ '200': ++ description: 成功采集数据 ++ content: ++ application/json: ++ schema: { $ref: '#/components/schemas/ApiResponse' } ++ /analyzer: ++ get: ++ summary: 瓶颈分析接口 ++ description: 基于采集到的数据分析性能瓶颈 ++ parameters: ++ - name: ip ++ in: query ++ required: true ++ schema: ++ type: string ++ example: "9.82.201.111" ++ description: 待分析瓶颈机器的ip地址 ++ responses: ++ '200': ++ description: 成功分析瓶颈 ++ content: ++ application/json: ++ schema: { $ref: '#/components/schemas/ApiResponse' } ++ /optimizer: ++ get: ++ summary: 参数推荐接口 ++ description: 基于当前瓶颈推荐对应参数 ++ parameters: ++ - name: ip ++ in: query ++ required: true ++ schema: ++ type: string ++ example: "9.82.201.111" ++ description: 待分析瓶颈机器的ip地址 ++ responses: ++ '200': ++ description: 成功推荐参数 ++ content: ++ application/json: ++ schema: { $ref: '#/components/schemas/ApiResponse' } ++components: ++ schemas: ++ ApiResponse: ++ type: object ++ properties: ++ data: ++ type: object +diff --git a/copilot-tune/config/optimize_config.yaml b/copilot-tune/config/optimize_config.yaml +new file mode 100644 +index 0000000..71cb539 +--- /dev/null ++++ b/copilot-tune/config/optimize_config.yaml +@@ -0,0 +1,4 @@ ++# Static or Dynamic ++knob_tuning : "static" ++evaluations : "" ++goal : "" +diff --git a/copilot-tune/doc/zh/README.md b/copilot-tune/doc/zh/README.md +new file mode 100644 +index 0000000..5823fad +--- /dev/null ++++ b/copilot-tune/doc/zh/README.md +@@ -0,0 +1,199 @@ ++# EulerCopilot Tune安装使用指南 ++ ++### 项目简介 ++EulerCopilot Tune通过采集系统、微架构、应用等维度的指标数据,结合大模型和定制化的prompt工程,针对不同应用的可调参数给出可靠的参数推荐,同时根据推荐的参数运行benchmark,与baseline做对比并计算出推荐参数对应用性能的提升值。 ++ ++### 软件架构 ++软件架构说明 ++ ++### 安装教程 ++ ++1. 
下载gitee源码,gitee代码仓地址: ++https://gitee.com/openeuler/A-Tune/tree/euler-copilot-tune/ ++(注意:分支指定为euler-copilot-tune) ++2. 安装其他依赖 ++```bash ++#1.调优程序运行机器安装python venv依赖 ++yum install python3-devel krb5-devel ++#2.目标应用所在机器安装调优依赖并启动sysstat ++yum install sysstat perf ++systemctl start sysstat ++``` ++3. 调优程序运行机器安装python依赖: ++```BASH ++#1.创建并加载python venv ++python3 -m venv venv ++source venv/bin/activate ++ ++#2.安装python依赖包 ++pip3 install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple ++``` ++ ++### 使用指南 ++ ++1. 准备env yaml,放入项目的config/.env.yaml中,具体格式如下: ++```YAML ++LLM_KEY: "YOUR_LLM_KEY" ++LLM_URL: "YOUR_LLM_URL" ++LLM_MODEL_NAME: "YOUR_LLM_MODEL_NAME" ++LLM_MAX_TOKENS: ++ ++REMOTE_EMBEDDING_ENDPOINT: "YOUR_EMBEDDING_MODEL_URL" ++REMOTE_EMBEDDING_MODEL_NAME: "YOUR_MODEL_NAME" ++ ++servers: ++ - ip: "" #应用所在ip ++ host_user: "" #登录机器的user id ++ password: "" #登录机器的密码 ++ port: #应用所在ip的具体port ++ app: "mysql" #当前支持mysql、nginx、pgsql、spark、flink、redis、ceph、gaussdb ++ target_process_name: "mysqld" #调优应用的name ++ business_context: "高并发数据库服务,CPU负载主要集中在用户态处理" #调优应用的描述(用于策略生成) ++ max_retries: 3 ++ delay: 1.0 ++ ++feature: ++ - need_restart_application: False #修改参数之后是否需要重启应用使参数生效 ++ need_recover_cluster: False #调优过程中是否需要恢复集群 ++ microDep_collector: True #是否开启微架构指标采集 ++ pressure_test_mode: True #是否通过压测模拟负载环境 ++ tune_system_param: False #是否调整系统参数 ++ tune_app_param: True #是否调整应用参数 ++ strategy_optimization: False #是否需要策略推荐 ++ benchmark_timeout: 3600 #benchmark执行超时限制 ++``` ++ ++2. 完善app_config.yaml,放入项目的config/app_config.yaml中(重点是补充set_param_template、get_param_template、benchmark脚本),具体内容如下: ++```YAML ++mysql: ++ user: "root" ++ password: "123456" ++ config_file: "/etc/my.cnf" ++ port: 3306 ++ set_param_template: 'grep -q "^$param_name\\s*=" "$config_file" && sed -i "s/^$param_name\\s*=.*/$param_name = $param_value/" "$config_file" || sed -i "/\\[mysqld\\]/a $param_name = $param_value" "$config_file"' ++ get_param_template: 'grep -E "^$param_name\s*=" $config_file | cut -d= -f2- | xargs' ++ stop_workload: "systemctl stop mysqld" ++ start_workload: "systemctl start mysqld" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/mysql/parse_benchmark.sh $host_ip $port $user $password" ++ performance_metric: "QPS" ++ ++flink: ++ set_param_template: 'sh /home/wsy/set_param.sh $param_name $param_value' ++ get_param_template: 'sh /home/wsy/get_param.sh $param_name' ++ benchmark: "sh /home/wsy/nexmark_test.sh" ++ stop_workload: 'docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/stop-cluster.sh && /usr/local/nexmark/bin/shutdown_cluster.sh"' ++ start_workload: 'docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/start-cluster.sh"' ++ performance_metric: "THROUGHPUT" ++ ++pgsql: ++ user: "postgres" ++ password: "postgres" ++ config_file: "/data/data1/pgsql/postgresql.conf" ++ port: 5432 ++ set_param_template: 'grep -qE "^\s*$param_name\s*=" "$config_file" && sed -i "s/^[[:space:]]*$param_name[[:space:]]*=.*/$param_name = $param_value/" "$config_file" || echo "$param_name = $param_value" >> "$config_file"' ++ get_param_template: 'grep -oP "^\s*$param_name\s*=\s*\K.*" "$config_file"' ++ stop_workload: "su - postgres -c '/usr/local/pgsql/bin/pg_ctl stop -D /data/data1/pgsql/ -m fast'" ++ start_workload: "su - postgres -c '/usr/local/pgsql/bin/pg_ctl start -D /data/data1/pgsql/ -l /var/log/postgresql/postgresql.log'" ++ benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/postgresql/parse_benchmark.sh $host_ip $port $user $password" ++ performance_metric: "QPS" ++ 
++spark:
++  set_param_template: 'sh /path/of/set_param.sh $param_name $param_value'
++  get_param_template: 'sh /path/of/get_param.sh $param_name'
++  benchmark: "sh /path/of/spark_benchmark.sh"
++  performance_metric: "DURATION"
++
++nginx:
++  port: 10000
++  config_file: "/usr/local/nginx/conf/nginx.conf"
++  set_param_template: 'grep -q "^\\s*$param_name\\s\\+" "$config_file" && sed -i "s|^\\s*$param_name\\s\\+.*|    $param_name $param_value;|" "$config_file" || sed -i "/http\\s*{/a\    $param_name $param_value;" "$config_file"'
++  get_param_template: 'grep -E "^\\s*$param_name\\s+" $config_file | head -1 | sed -E "s/^\\s*$param_name\\s+(.*);/\\1/"'
++  stop_workload: "/usr/local/nginx/sbin/nginx -s reload"
++  start_workload: "/usr/local/nginx/sbin/nginx -s reload"
++  benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/nginx/parse_benchmark.sh $host_ip $port"
++  performance_metric: "QPS"
++
++ceph:
++  set_param_template: 'ceph config set osd "$param_name" "$param_value"'
++  get_param_template: 'sh /path/of/get_params.sh'
++  start_workload: "sh /path/of/restart_ceph.sh"
++  benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/ceph/parse_benchmark.sh"
++  performance_metric: "BANDWIDTH"
++
++gaussdb:
++  user: ""
++  password: ""
++  config_file: "/path/of/config_file"
++  port: 5432
++  set_param_template: 'gs_guc set -Z datanode -N all -I all -c "${param_name}=${param_value}"'
++  get_param_template: 'gs_guc check -Z datanode -N all -I all -c "${param_name}"'
++  stop_workload: "cm_ctl stop -m i"
++  start_workload: "cm_ctl start"
++  recover_workload: "$EXECUTE_MODE:local sh /path/of/gaussdb_cluster_recover.sh"
++  benchmark: "$EXECUTE_MODE:local sh /path/of/gaussdb_benchmark.sh"
++  performance_metric: "DURATION"
++
++system:
++  set_param_template: 'sysctl -w $param_name=$param_value'
++  get_param_template: 'sysctl $param_name'
++
++redis:
++  port: 6379
++  config_file: "/etc/redis.conf"
++  set_param_template: "sed -i 's/^$param_name/$param_name $param_value/g' $config_file"
++  get_param_template: "grep -P '$param_name' $config_file | awk '{print $2}'"
++  start_workload: "systemctl start redis"
++  stop_workload: "systemctl stop redis"
++  benchmark: "$EXECUTE_MODE:local sh $SCRIPTS_DIR/redis/parse_benchmark.sh $host_ip $port"
++  performance_metric: "QPS"
++```
++其中:
++- set_param_template:根据调优结果修改应用参数,用于后续测试效果
++- get_param_template:获取应用参数
++- recover_workload:恢复集群
++- benchmark:benchmark脚本,格式如下(这里以spark为例,另见其后的通用骨架):
++
++```bash
++#(必须有)用于通知框架可以执行指标采集的标识
++echo 1 > /tmp/euler-copilot-fifo
++
++#benchmark具体执行
++cd /root/spark_auto_deploy_arm/spark_test
++sh tpcds_test_1t_spark331_linearity_2p.sh > /home/cxm/spark_benchmark.log 2>&1
++
++#(必须有)计算并输出相应的performance_metric的语句
++cd /home/cxm
++time_taken=$(grep "time_taken:" "spark_benchmark.log" | sed -E 's/.*time_taken:([0-9.]+)s.*/\1/' | paste -sd+ | bc | xargs printf "%.2f")
++echo $time_taken
++```
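++在spark示例之外,下面是一个最小的通用骨架(仅作示意:其中的负载命令、日志路径与"qps:"日志格式均为假设占位,请替换为实际业务内容):
++
++```bash
++#!/bin/bash
++#(必须有)通知框架可以开始指标采集
++echo 1 > /tmp/euler-copilot-fifo
++
++# benchmark具体执行(占位命令,请替换)
++sh /path/of/your_workload.sh > /tmp/your_benchmark.log 2>&1
++
++#(必须有)脚本最后输出单个数值作为performance_metric,这里假设日志中有"qps: <数值>"行
++grep "qps:" /tmp/your_benchmark.log | awk -F': *' '{print $2}' | tail -n 1
++```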
++3. 运行EulerCopilot
++```bash
++export PYTHONPATH="`pwd`:$PYTHONPATH"
++python3 src/start_tune.py
++```
++
++#### 服务的方式运行:
++
++1、安装服务
++
++进入项目目录,执行 python3 setup.py install
++
++2、在/etc/copilot-tune 目录修改配置文件,具体内容参考上面源码部署方式
++
++3、启动服务
++
++~~~bash
++#命令行执行如下命令
++#开启调优
++copilot-tune
++#开启mcpserver,日志可通过执行 journalctl -xe -u tune-mcpserver --all -f 查看
++tune-mcpserver
++~~~
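++
++此外可以启动 tune-openapi 服务,并按 config/openapi.yaml 验证REST接口(示意:其中的ip取值为占位,请替换为实际待调优机器的ip):
++
++~~~bash
++systemctl start tune-openapi
++# 采集指定机器的性能数据
++curl "http://localhost:8092/collector?ip=9.82.201.111"
++# 分析性能瓶颈
++curl "http://localhost:8092/analyzer?ip=9.82.201.111"
++# 根据瓶颈推荐可调参数
++curl "http://localhost:8092/optimizer?ip=9.82.201.111"
++~~~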
++
++### 常见问题解决
++
++见 [FAQ.md](./FAQ.md)
+\ No newline at end of file
+diff --git a/copilot-tune/requirements.txt b/copilot-tune/requirements.txt
+new file mode 100644
+index 0000000..2b86bdc
+--- /dev/null
++++ b/copilot-tune/requirements.txt
+@@ -0,0 +1,20 @@
++faiss_cpu==1.8.0.post1
++fastapi==0.112.2
++numpy==1.26.4
++openai==1.52.0
++paramiko==3.4.1
++pydantic>=2.8.2
++PyYAML==6.0.2
++scikit_learn==1.5.1
++tqdm==4.66.4
++uvicorn==0.30.6
++zhipuai==2.1.4.20230814
++requests==2.32.3
++langchain==0.3.4
++langchain-openai==0.2.3
++email-validator==2.2.0
++httpx==0.27.2
++tabulate==0.9.0
++pyfiglet==1.0.3
++gssapi==1.9.0
++pandas==2.3.1
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/ceph/benchmark.sh b/copilot-tune/scripts/ceph/benchmark.sh
+new file mode 100644
+index 0000000..b372677
+--- /dev/null
++++ b/copilot-tune/scripts/ceph/benchmark.sh
+@@ -0,0 +1,2 @@
++echo 1 > /tmp/euler-copilot-fifo
++sh /root/A-Tune/examples/tuning/ceph/ceph_benchmark.sh
+diff --git a/copilot-tune/scripts/ceph/parse_benchmark.sh b/copilot-tune/scripts/ceph/parse_benchmark.sh
+new file mode 100644
+index 0000000..a6328d7
+--- /dev/null
++++ b/copilot-tune/scripts/ceph/parse_benchmark.sh
+@@ -0,0 +1,7 @@
++#!/bin/bash
++SCRIPT_PATH="$(realpath "$0")"
++SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
++cd "$SCRIPT_DIR"
++sh benchmark.sh > benchmark.log 2>&1
++grep "bandwidth_val" benchmark.log | awk '{print $3}'
++
+diff --git a/copilot-tune/scripts/flink/benchmark.sh b/copilot-tune/scripts/flink/benchmark.sh
+new file mode 100644
+index 0000000..84463c4
+--- /dev/null
++++ b/copilot-tune/scripts/flink/benchmark.sh
+@@ -0,0 +1,4 @@
++#!/bin/bash
++
++echo 1 > /tmp/euler-copilot-fifo
++docker exec -i flink_jm_8c32g bash -c "source /etc/profile && /usr/local/flink-1.16.3/bin/stop-cluster.sh && /usr/local/nexmark/bin/shutdown_cluster.sh && /usr/local/flink-1.16.3/bin/start-cluster.sh && /usr/local/nexmark/bin/setup_cluster.sh && cd /usr/local/nexmark/bin && ./run_query.sh q0" > /home/wsy/benchmark.log 2>&1
+diff --git a/copilot-tune/scripts/flink/parse_benchmark.sh b/copilot-tune/scripts/flink/parse_benchmark.sh
+new file mode 100644
+index 0000000..b785931
+--- /dev/null
++++ b/copilot-tune/scripts/flink/parse_benchmark.sh
+@@ -0,0 +1,34 @@
++#!/bin/bash
++
++LOG_FILE=/home/wsy/benchmark.log
++
++convert_to_k() {
++    val=$1
++    unit=$2
++    case "$unit" in
++        K/s) echo "$val" ;;
++        M/s) awk -v v="$val" 'BEGIN{printf "%.3f", v*1000}' ;;
++        *) echo 0 ;;
++    esac
++}
++
++total_throughput=$(grep '|Total' "$LOG_FILE" | awk -F'|' '
++    {
++        # strip leading/trailing spaces from the throughput field
++        gsub(/^ +| +$/, "", $(NF-1))
++        split($(NF-1), a, " ")
++        val = a[1]
++        unit = a[2]
++        if (unit == "M/s") val *= 1000
++        total += val
++    }
++    END {printf "%.3f\n", total}
++')
++
++echo "$total_throughput"
++
++timestamp=$(date +"%Y%m%d-%H%M%S")
++
++mkdir -p /home/wsy/logs
++
++mv /home/wsy/benchmark.log "/home/wsy/logs/${timestamp}-benchmark.log"
+diff --git a/copilot-tune/scripts/gaussdb/benchmark.sh b/copilot-tune/scripts/gaussdb/benchmark.sh
+new file mode 100644
+index 0000000..6919b88
+--- /dev/null
++++ b/copilot-tune/scripts/gaussdb/benchmark.sh
+@@ -0,0 +1,2 @@
++echo 1 > /tmp/euler-copilot-fifo
++sh /data2/zjh/gitcode/TPCH_tools/starter-yyl.sh > /home/wsy/gaussdb_benchmark.log
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/gaussdb/parse_benchmark.sh b/copilot-tune/scripts/gaussdb/parse_benchmark.sh
+new file mode 100644
+index 0000000..7204cd2
+--- /dev/null
++++ b/copilot-tune/scripts/gaussdb/parse_benchmark.sh
+@@ -0,0 +1,2 @@
++cd /data2/zjh/gitcode/TPCH_tools/tpch_test_logs/latest_logs/
++awk -F',' 'NR>1 {sum += $6} END {printf "%.2f\n", sum}' output_off.csv
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/mysql/benchmark.sh b/copilot-tune/scripts/mysql/benchmark.sh
+new file mode 100644
+index 0000000..cca39c4
+--- /dev/null
++++ b/copilot-tune/scripts/mysql/benchmark.sh
+@@ -0,0 +1,4 @@
++sysbench --db-driver=mysql --mysql-host=$1 --mysql-port=$2 --mysql-user=$3 --mysql-password=$4 --mysql-db=sbtest --table_size=800000 --tables=10 --time=180 --threads=96 --report-interval=10 oltp_read_write prepare
++echo 1 > /tmp/euler-copilot-fifo
++sysbench --mysql-host=$1 --mysql-port=$2 --mysql-user=$3 --mysql-password=$4 --mysql-db=sbtest --mysql-storage-engine=innodb --mysql-ignore-errors=1062,1213,1205,1020 --tables=10 --table-size=800000 --time=180 --events=0 --report-interval=1 --rand-type=uniform --db-driver=mysql --percentile=95 oltp_write_only --forced-shutdown=off --db-ps-mode=disable --threads=128 run
++sysbench --db-driver=mysql --mysql-host=$1 --mysql-port=$2 --mysql-user=$3 --mysql-password=$4 --mysql-db=sbtest --table_size=800000 --tables=10 --time=180 --threads=96 --report-interval=10 oltp_read_write cleanup
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/mysql/parse_benchmark.sh b/copilot-tune/scripts/mysql/parse_benchmark.sh
+new file mode 100644
+index 0000000..cdc832f
+--- /dev/null
++++ b/copilot-tune/scripts/mysql/parse_benchmark.sh
+@@ -0,0 +1,6 @@
++#!/bin/bash
++SCRIPT_PATH="$(realpath "$0")"
++SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
++cd "$SCRIPT_DIR"
++sh benchmark.sh $1 $2 $3 $4 > benchmark.log 2>&1
++grep "queries:" benchmark.log | awk -F'[()]' '{print $2}' | awk '{print $1}'
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/nginx/benchmark.sh b/copilot-tune/scripts/nginx/benchmark.sh
+new file mode 100644
+index 0000000..18940c3
+--- /dev/null
++++ b/copilot-tune/scripts/nginx/benchmark.sh
+@@ -0,0 +1,5 @@
++# target address and port
++TARGET_HOST="$1"
++TARGET_PORT="$2"
++echo 1 > /tmp/euler-copilot-fifo
++httpress -n 20000000 -c 512 -t 7 -k http://${TARGET_HOST}:${TARGET_PORT}
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/nginx/parse_benchmark.sh b/copilot-tune/scripts/nginx/parse_benchmark.sh
+new file mode 100644
+index 0000000..a5c2e28
+--- /dev/null
++++ b/copilot-tune/scripts/nginx/parse_benchmark.sh
+@@ -0,0 +1,12 @@
++#!/bin/bash
++
++SCRIPT_PATH="$(realpath "$0")"
++SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
++cd "$SCRIPT_DIR"
++
++# run benchmark.sh with all arguments, redirecting output to benchmark.log
++sh benchmark.sh $1 $2 > benchmark.log 2>&1
++
++# extract rps (requests per second) from benchmark.log;
++# in the httpress TIMING line, the fourth field is the rps value
++grep 'TIMING:' benchmark.log | awk '{print $4}'
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/postgresql/benchmark.sh b/copilot-tune/scripts/postgresql/benchmark.sh
+new file mode 100644
+index 0000000..566712b
+--- /dev/null
++++ b/copilot-tune/scripts/postgresql/benchmark.sh
+@@ -0,0 +1,4 @@
++sysbench --db-driver=pgsql --pgsql-host=$1 --pgsql-port=$2 --pgsql-user=$3 --pgsql-password=$4 --pgsql-db=test_64 --tables=10 --table_size=100000 --threads=32 --report-interval=1 oltp_read_write --rand-type=uniform prepare
++echo 1 > /tmp/euler-copilot-fifo
++sysbench --db-driver=pgsql --pgsql-host=$1 --pgsql-port=$2 --pgsql-user=$3 --pgsql-password=$4 --pgsql-db=test_64 --tables=10 --table_size=100000 --time=180 --threads=32 --report-interval=1 oltp_read_write --rand-type=uniform run
++sysbench --db-driver=pgsql --pgsql-host=$1 --pgsql-port=$2 --pgsql-user=$3 --pgsql-password=$4 --pgsql-db=test_64 --tables=10 --table_size=100000 --time=180 --threads=32 --report-interval=1 oltp_read_write --rand-type=uniform cleanup
+diff --git a/copilot-tune/scripts/postgresql/parse_benchmark.sh b/copilot-tune/scripts/postgresql/parse_benchmark.sh
+new file mode 100644
+index 0000000..cdc832f
+--- /dev/null
++++ b/copilot-tune/scripts/postgresql/parse_benchmark.sh
+@@ -0,0 +1,6 @@
++#!/bin/bash
++SCRIPT_PATH="$(realpath "$0")"
++SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
++cd "$SCRIPT_DIR"
++sh benchmark.sh $1 $2 $3 $4 > benchmark.log 2>&1
++grep "queries:" benchmark.log | awk -F'[()]' '{print $2}' | awk '{print $1}'
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/redis/benchmark.sh b/copilot-tune/scripts/redis/benchmark.sh
+new file mode 100644
+index 0000000..d7e6200
+--- /dev/null
++++ b/copilot-tune/scripts/redis/benchmark.sh
+@@ -0,0 +1,17 @@
++#!/bin/bash
++
++# default connection parameters
++REDIS_HOST="${1:-127.0.0.1}"
++REDIS_PORT="${2:-6379}"
++
++echo 1 > /tmp/euler-copilot-fifo
++
++# run redis-benchmark and capture its CSV output
++OUTPUT=$(redis-benchmark -h "$REDIS_HOST" -p "$REDIS_PORT" -t set,get,incr,rpop,sadd,hset,lrange_600 --csv)
++
++echo "$OUTPUT" > benchmark.log
++
++# parse the CSV output and sum up the total QPS
++TOTAL_QPS=$(echo "$OUTPUT" | awk -F',' '{gsub(/"/,""); sum += $2} END {printf "%.2f", sum}')
++
++echo "$TOTAL_QPS"
+diff --git a/copilot-tune/scripts/redis/parse_benchmark.sh b/copilot-tune/scripts/redis/parse_benchmark.sh
+new file mode 100644
+index 0000000..a005128
+--- /dev/null
++++ b/copilot-tune/scripts/redis/parse_benchmark.sh
+@@ -0,0 +1,6 @@
++#!/bin/bash
++SCRIPT_PATH="$(realpath "$0")"
++SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
++cd "$SCRIPT_DIR"
++sh benchmark.sh $1 $2 > benchmark.log 2>&1
++cat benchmark.log | awk -F',' '{gsub(/"/,""); sum += $2} END {printf "%.2f", sum}'
+\ No newline at end of file
+diff --git a/copilot-tune/scripts/spark/benchmark.sh b/copilot-tune/scripts/spark/benchmark.sh
+new file mode 100644
+index 0000000..358fed0
+--- /dev/null
++++ b/copilot-tune/scripts/spark/benchmark.sh
+@@ -0,0 +1,4 @@
++echo 1 > /tmp/euler-copilot-fifo
++cd /root/spark_auto_deploy_arm/spark_test
++sh tpcds_test_1t_spark331_linearity_2p.sh > /home/cxm/spark_benchmark.log 2>&1
++
+diff --git a/copilot-tune/scripts/spark/parse_benchmark.sh b/copilot-tune/scripts/spark/parse_benchmark.sh
+new file mode 100644
+index 0000000..882c779
+--- /dev/null
++++ b/copilot-tune/scripts/spark/parse_benchmark.sh
+@@ -0,0 +1,14 @@
++#!/bin/bash
++SCRIPT_PATH="$(realpath "$0")"
++SCRIPT_DIR="$(dirname "$SCRIPT_PATH")"
++cd "$SCRIPT_DIR"
++ssh root@9.82.36.53 "cd /home/cxm"
++cd /home/cxm
++time_taken=$(grep "time_taken:" "spark_benchmark.log" \
++| sed -nE 's/.*time_taken:([0-9.]+)s.*/\1/p' \
++| grep -E '^[0-9.]+' \
++| paste -sd+ \
++| bc \
++| xargs printf "%.2f")
++
++echo $time_taken
+\ No newline at end of file
+diff --git a/copilot-tune/service/tune-mcpserver.service b/copilot-tune/service/tune-mcpserver.service
+new file mode 100644
+index 0000000..a7d9ac1
+--- /dev/null
++++ b/copilot-tune/service/tune-mcpserver.service
+@@ -0,0 +1,15 @@
++[Unit]
++Description=euler-copilot-tune MCP Server
++After=network.target
++
++[Service]
++Type=exec
++ExecStart=/usr/local/bin/tune-mcpserver
++Restart=on-failure
++RestartSec=1
++KillMode=control-group
++KillSignal=SIGTERM
++TimeoutStopSec=30
++SuccessExitStatus=143
++[Install]
++WantedBy=multi-user.target
+\ No newline at end of file
+diff --git a/copilot-tune/service/tune-openapi.service b/copilot-tune/service/tune-openapi.service
+new file mode 100644
+index 0000000..f890a2d
+--- /dev/null
++++ b/copilot-tune/service/tune-openapi.service
+@@ -0,0 +1,15 @@
++[Unit]
++Description=euler-copilot-tune OpenAPI Server
++After=network.target
++
++[Service]
++Type=exec
++ExecStart=/usr/local/bin/tune-openapi
++Restart=on-failure
++RestartSec=1
++KillMode=control-group
++KillSignal=SIGTERM
++TimeoutStopSec=30
++SuccessExitStatus=143
++[Install]
++WantedBy=multi-user.target
+\ No newline at end of file
+diff --git a/copilot-tune/setup.py b/copilot-tune/setup.py
+new file mode 100644
+index 0000000..c78fb45
+--- /dev/null
++++ b/copilot-tune/setup.py
+@@ -0,0 +1,69 @@
++import os
++
++from setuptools import setup, find_packages
++from glob import glob
++
++# clean up configs and service files left over from a previous install
++cfg_path = "/etc/copilot-tune"
++for root, dirs, files in os.walk(cfg_path):
++    for file in files:
++        os.remove(os.path.join(root, file))
++ser = "/usr/lib/systemd/system/tune-mcpserver.service"
++if os.path.isfile(ser):
++    os.remove(ser)
++ser = "/usr/lib/systemd/system/tune-openapi.service"
++if os.path.isfile(ser):
++    os.remove(ser)
++
++
++def get_recursive_files_with_relpath(src_root_dir):
++    file_mapping = []
++    for file_path in glob(f'{src_root_dir}/**/*', recursive=True):
++        if os.path.isfile(file_path):  # only regular files; target directories are created automatically
++            relative_path = os.path.relpath(file_path, src_root_dir)
++            file_mapping.append((file_path, relative_path))
++    return file_mapping
++
++
++# -------------------------- configure data_files --------------------------
++knowledge_src_root = 'src/knowledge_base'
++knowledge_files = get_recursive_files_with_relpath(knowledge_src_root)
++knowledge_data = []
++for src_file, rel_path in knowledge_files:
++    target_dir = os.path.join('/etc/copilot-tune/knowledge_base/', os.path.dirname(rel_path))
++    knowledge_data.append((target_dir, [src_file]))
++config_src_root = 'config'
++config_files = get_recursive_files_with_relpath(config_src_root)
++config_data = []
++for src_file, rel_path in config_files:
++    target_dir = os.path.join('/etc/copilot-tune/config/', os.path.dirname(rel_path))
++    config_data.append((target_dir, [src_file]))
++
++scripts_src_root = 'scripts'
++scripts_files = get_recursive_files_with_relpath(scripts_src_root)
++scripts_data = []
++for src_file, rel_path in scripts_files:
++    target_dir = os.path.join('/etc/copilot-tune/scripts/', os.path.dirname(rel_path))
++    scripts_data.append((target_dir, [src_file]))
++data_files = [('/etc/copilot-tune/config/', glob('config/*')),
++              ('/etc/copilot-tune/config/', glob('config/.env.yaml')),  # glob('config/*') skips dotfiles, so add .env.yaml explicitly
++              ('/etc/copilot-tune/scripts/', glob('scripts/*/*')),
++              ('/usr/lib/systemd/system/', glob('service/*'))] + knowledge_data + config_data + scripts_data
++setup(
++    name="copilot-tune",
++    version="1.0",
++    author="xu hou",
++    author_email="houxu5@h-partners.com",
++    description="Tune MCP Service",
++    packages=find_packages(where="."),
++    include_package_data=True,
++    data_files=data_files,
++    entry_points={
++        "console_scripts": [
++            "tune-openapi = src.start_workflow:main",
++            "tune-mcpserver = src.start_mcpserver:main",
++            "copilot-tune = src.start_tune:main",
++        ],
++    },
++    url="https://gitee.com/openeuler/A-Tune/tree/copilot-tune/"
++) +diff --git a/copilot-tune/src/__init__.py b/copilot-tune/src/__init__.py +new file mode 100644 +index 0000000..9e464c7 +--- /dev/null ++++ b/copilot-tune/src/__init__.py +@@ -0,0 +1,11 @@ ++import logging ++from src.utils.common import display_banner ++ ++logging.getLogger("paramiko.transport").propagate = False ++display_banner() ++ ++logging.basicConfig( ++ level=logging.INFO, # 设置日志级别 ++ format="%(asctime)s - %(levelname)s [%(filename)s:%(funcName)s:%(lineno)d] - %(message)s", # 设置日志格式 ++ datefmt="%Y-%m-%d %H:%M:%S", # 设置时间格式 ++) +diff --git a/copilot-tune/src/config.py b/copilot-tune/src/config.py +new file mode 100644 +index 0000000..0e823c7 +--- /dev/null ++++ b/copilot-tune/src/config.py +@@ -0,0 +1,32 @@ ++import os ++import yaml ++ ++from src.utils.constant import ENV_CONFIG_PATH ++ ++ ++class Config: ++ config: dict ++ ++ def __init__(self): ++ if os.getenv("CONFIG"): ++ config_file = os.getenv("CONFIG") ++ else: ++ config_file = os.path.abspath( ++ os.path.join(os.path.dirname(__file__), "..", "config", ".env.yaml") ++ ) ++ if not os.path.exists(config_file) or not os.path.isfile(config_file): ++ config_file = ENV_CONFIG_PATH ++ ++ with open(config_file, 'r', encoding='utf-8') as file: ++ self.config = yaml.safe_load(file) ++ if os.getenv("PROD"): ++ os.remove(config_file) ++ ++ def __getitem__(self, key): ++ if key in self.config: ++ return self.config[key] ++ else: ++ return None ++ ++ ++config = Config() +diff --git a/copilot-tune/src/knowledge_base/README.md b/copilot-tune/src/knowledge_base/README.md +new file mode 100644 +index 0000000..22dd30b +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/README.md +@@ -0,0 +1 @@ ++性能的相关知识库 +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/ceph.json b/copilot-tune/src/knowledge_base/knob_params/ceph.json +new file mode 100644 +index 0000000..80d0921 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/ceph.json +@@ -0,0 +1,641 @@ ++{ ++ "bluestore_throttle_bytes": { ++ "desc": "该参数控制Bluestore的写入速率限制,以字节为单位。可以缓解disk IO瓶颈。增大该值可以提高写入性能,减小则可以降低写入速率以保护其他操作。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1073741824 ++ ], ++ "default_value": 0, ++ "related_param": [] ++ }, ++ "bluestore_throttle_deferred_bytes": { ++ "desc": "该参数控制Bluestore在写入时的延迟字节数限制。可以缓解disk IO瓶颈。增大该值可以允许更多的延迟写入,减小则可以减少延迟写入的数量。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1073741824 ++ ], ++ "default_value": 0, ++ "related_param": [] ++ }, ++ "journal_max_write_bytes": { ++ "desc": "该参数定义了日志一次性写入的最大字节数(Bytes)。增大该值可以提高日志写入性能,缓解磁盘IO瓶颈,减小则可能导致日志写入延迟。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ 1073741824 ++ ], ++ "default_value": "10485760", ++ "related_param": [] ++ }, ++ "journal_max_write_entries": { ++ "desc": "该参数定义了日志最大写入条目数。增大该值可以提高日志写入性能,减小则可能导致日志写入延迟。该参数可以缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ 1000000 ++ ], ++ "default_value": "100", ++ "related_param": [] ++ }, ++ "osd_client_message_size_cap": { ++ "desc": "OSD客户端消息的最大大小,单位为字节。该参数控制允许在内存中处理的最大客户端数据消息大小,设置此参数可以防止过大的消息导致网络瓶颈,适当的值可以提高网络性能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 4096, ++ 1048576 ++ ], ++ "default_value": "500Mi", ++ "related_param": [] ++ }, ++ "osd_map_cache_size": { ++ "desc": "设置OSD映射缓存的大小,单位为字节。增大该值可以提高OSD映射的访问速度,但会增加内存使用。该参数的默认值为50,建议根据实际需求进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "50", ++ "related_param": [] ++ }, ++ 
"osd_max_backfills": { ++ "desc": "osd_max_backfills参数用于设置单个OSD允许的最大并发回填操作数。增大此值可以提高数据恢复速度,但可能会导致磁盘IO瓶颈。该参数分别适用于读和写操作。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "default_value": "1", ++ "related_param": [ ++ "osd_mclock_override_recovery_settings" ++ ] ++ }, ++ "osd_max_pgls": { ++ "desc": "设置每个OSD允许的最大PG日志数量。增大该值可以提高日志的处理能力,但可能会导致内存使用增加。该参数的合理配置对于Ceph集群的性能至关重要。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "1Ki", ++ "related_param": [] ++ }, ++ "osd_max_pg_log_entries": { ++ "desc": "该参数设置每个Placement Group(PG)日志的最大条目数。增大该值可以提高日志的处理能力,但可能会导致内存使用增加。修建PGLog时保留的最大PGLog数为该值。该参数的合理配置对于Ceph集群的性能和稳定性至关重要。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "10000", ++ "related_param": [ ++ "osd_min_pg_log_entries", ++ "osd_pg_log_dups_tracked", ++ "osd_target_pg_log_entries_per_osd" ++ ] ++ }, ++ "osd_max_pg_per_osd_hard_ratio": { ++ "desc": "该参数设置每个OSD允许的最大PG数量与OSD数量的比率。增大该值可以提高存储的扩展性,但可能会导致内存和CPU的使用增加。该比率在达到设定值后,OSD将拒绝创建新的PG。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "3.0", ++ "related_param": [ ++ "mon_max_pg_per_osd" ++ ] ++ }, ++ "osd_max_scrubs": { ++ "desc": "设置每个OSD同时进行的最大scrubbing操作数量。增大该值可以提高scrubbing的并发性,但可能会导致CPU和disk IO瓶颈。", ++ "type": "discrete", ++ "dtype": "int", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "3", ++ "related_param": [] ++ }, ++ "osd_max_write_size": { ++ "desc": "OSD允许的最大写入大小,单位为字节。增大此值可以提高写入性能,但可能会导致内存使用增加,从而影响CPU和内存瓶颈。该参数的最大写入大小为4096字节到1048576字节之间,默认值为4194304字节。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 4096, ++ 1048576 ++ ], ++ "default_value": "4194304", ++ "related_param": [] ++ }, ++ "osd_mclock_iops_capacity_low_threshold_hdd": { ++ "desc": "该参数用于设置HDD的IOPS容量低阈值(以4KiB块大小为单位),当IOPS容量低于该阈值时,将忽略OSD基准测试结果,并回退到由osd_mclock_max_capacity_iops_hdd定义的最后有效或默认IOPS容量。减小该值可以避免HDD过载,从而缓解disk IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "50.0", ++ "related_param": [ ++ "osd_mclock_max_capacity_iops_hdd" ++ ] ++ }, ++ "osd_mclock_iops_capacity_low_threshold_ssd": { ++ "desc": "该参数定义了SSD的IOPS容量低阈值(以4KiB块大小为单位),当OSD的IOPS性能低于此阈值时,将忽略OSD基准测试结果,并回退到由osd_mclock_max_capacity_iops_ssd定义的最后有效或默认IOPS容量。减小该值可以避免SSD过载,从而缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "1000.0", ++ "related_param": [ ++ "osd_mclock_max_capacity_iops_ssd" ++ ] ++ }, ++ "osd_mclock_iops_capacity_threshold_hdd": { ++ "desc": "该参数定义了HDD的IOPS容量阈值(以4KiB块大小为单位),超过该阈值后将忽略OSD基准测试结果,并回退到由osd_mclock_max_capacity_iops_hdd定义的最后有效或默认IOPS容量。增大该值可以提高HDD的使用效率,从而缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "500.0", ++ "related_param": [ ++ "osd_mclock_max_capacity_iops_hdd" ++ ] ++ }, ++ "osd_mclock_iops_capacity_threshold_ssd": { ++ "desc": "该参数定义了SSD的IOPS容量阈值(以4KiB块大小为单位),超过该阈值后将忽略OSD基准测试结果,并回退到由osd_mclock_max_capacity_iops_ssd定义的最后有效或默认IOPS容量。增大该值可以提高SSD的使用效率,从而缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "80000.0", ++ "related_param": [ ++ "osd_mclock_max_capacity_iops_ssd" ++ ] ++ }, ++ "osd_mclock_max_capacity_iops_hdd": { ++ "desc": "Max random write IOPS capacity (at 4 KiB block size) to consider per OSD (for rotational media)。设置此参数可以帮助优化HDD的使用,缓解disk IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 
null ++ ], ++ "default_value": "315.0", ++ "related_param": [] ++ }, ++ "osd_mclock_max_capacity_iops_ssd": { ++ "desc": "Max random write IOPS capacity (at 4 KiB block size) to consider per OSD (for solid state media)。设置此参数可以帮助优化SSD的使用,缓解disk IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "21500.0", ++ "related_param": [] ++ }, ++ "osd_mclock_max_sequential_bandwidth_hdd": { ++ "desc": "该参数定义了OSD在使用旋转介质时考虑的最大顺序带宽(以字节/秒为单位)。设置此参数可以帮助优化HDD的顺序IO性能,缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "size", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "150Mi", ++ "related_param": [] ++ }, ++ "osd_mclock_max_sequential_bandwidth_ssd": { ++ "desc": "该参数用于设置OSD在固态媒体上的最大顺序带宽(以字节/秒为单位),可以帮助优化SSD的顺序IO性能,缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "size", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "1200Mi", ++ "related_param": [] ++ }, ++ "osd_mclock_profile": { ++ "desc": "控制OSD的多时钟配置文件。选择合适的配置文件可以优化IO性能,缓解disk IO瓶颈。可选的配置文件包括高恢复操作、低延迟操作和高客户端操作。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "balanced", "high_recovery_ops", "high_client_ops", "custom" ++ ], ++ "default_value": "high_recovery_ops", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_background_best_effort_lim": { ++ "desc": "该参数用于控制后台最佳努力调度器的IO限制,影响并发最佳努力恢复操作的数量。增大该值可以允许更多的并发最佳努力恢复请求,从而缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1.0 ++ ], ++ "default_value": "0.0", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_background_best_effort_res": { ++ "desc": "该参数用于控制后台最佳努力调度器的资源分配,影响最佳努力恢复操作的资源使用。增大该值可以提高资源分配,从而缓解磁盘IO瓶颈。默认值为0.0,范围在0到1.0之间。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1.0 ++ ], ++ "default_value": "0.0", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_background_best_effort_wgt": { ++ "desc": "该参数用于控制每个后台最佳努力调度的IO共享权重,影响最佳努力恢复操作的优先级。增大该值可以提高最佳努力恢复操作的优先级,从而缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "1", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_background_recovery_lim": { ++ "desc": "该参数用于控制后台恢复调度器的IO限制,限制并发恢复操作的数量。增大该值可以允许更多的并发恢复请求,从而缓解磁盘IO瓶颈。具体来说,它定义了背景恢复的IO限制,适用于需要优化恢复性能的场景。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1.0 ++ ], ++ "default_value": "0.0", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_background_recovery_res": { ++ "desc": "IO比例保留用于后台恢复(默认)。该参数控制后台恢复调度器的资源分配,增大该值可以提高资源分配,从而缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1.0 ++ ], ++ "default_value": "0.0", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_background_recovery_wgt": { ++ "desc": "IO share for each background recovery over reservation. 
该参数用于控制后台恢复操作的优先级,增大该值可以提高恢复操作的优先级,从而缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "1", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_client_lim": { ++ "desc": "IO限制用于每个客户端(默认)在预留之上。该参数控制并发客户端请求的数量,增大该值可以允许更多的并发请求,从而缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1.0 ++ ], ++ "default_value": "0.0", ++ "related_param": [] ++ }, ++ "osd_mclock_scheduler_client_res": { ++ "desc": "IO比例为每个客户端保留的资源。增大该值可以提高资源分配,从而缓解CPU瓶颈。默认值为0.0。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1.0 ++ ], ++ "default_value": "0.0", ++ "related_param": [ ++ "osd_op_queue" ++ ] ++ }, ++ "osd_mclock_scheduler_client_wgt": { ++ "desc": "IO share for each client (default) over reservation. 客户端调度器的权重,用于控制客户端请求的调度优先级。增大该值可以提高客户端请求的处理优先级,从而缓解CPU瓶颈。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "1", ++ "related_param": [] ++ }, ++ "osd_memory_target": { ++ "desc": "该选项设置了每个OSD进程能申请到的内存大小,单位为字节。适当设置此参数可以帮助缓解内存瓶颈,确保OSD进程在高负载情况下仍能保持性能。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "4294967296", ++ "related_param": [] ++ }, ++ "osd_min_pg_log_entries": { ++ "desc": "该参数设置每个Placement Group(PG)日志的最小条目数。增大该值可以提高日志的可靠性,但可能会增加内存使用。修建PGLog时,保留的最小PGLog数。该参数的合理配置有助于优化Ceph集群的性能和稳定性。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "250", ++ "related_param": [ ++ "osd_max_pg_log_entries", ++ "osd_pg_log_dups_tracked", ++ "osd_target_pg_log_entries_per_osd" ++ ] ++ }, ++ "osd_op_num_shards": { ++ "desc": "该参数定义了为特定OSD分配的操作分片数。增大此值可以提高并发处理能力,缓解CPU瓶颈,但可能会增加内存使用。建议在性能需求较高的环境中进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "default_value": "1", ++ "related_param": [] ++ }, ++ "osd_op_num_shards_ssd": { ++ "desc": "该参数定义了为特定的OSD(针对固态存储介质)分配的分片数量。增大此值可以提高SSD的并发处理能力,缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 2, ++ 64 ++ ], ++ "default_value": "8", ++ "related_param": [ ++ "osd_op_num_shards" ++ ] ++ }, ++ "osd_op_num_threads_per_shard": { ++ "desc": "每个OSD分片的工作线程数。增大此值可以提高并发处理能力,缓解CPU瓶颈,但可能会导致内存使用增加。建议根据实际负载情况进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 16 ++ ], ++ "default_value": "1", ++ "related_param": [] ++ }, ++ "osd_op_num_threads_per_shard_ssd": { ++ "desc": "每个OSD分片为给定的OSD(针对固态媒体)生成的工作线程数量。增大此值可以提高SSD的并发处理能力,缓解CPU瓶颈,但可能会导致内存使用增加。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 2, ++ 16 ++ ], ++ "default_value": "2", ++ "related_param": [ ++ "osd_op_num_threads_per_shard" ++ ] ++ }, ++ "osd_pool_default_crush_rule": { ++ "desc": "设置新创建池的默认CRUSH规则。该参数决定了在创建复制池时使用的CRUSH规则。默认值为,表示选择ID最低的规则。不同的CRUSH规则会影响数据的分布和冗余策略。", ++ "type": "discrete", ++ "dtype": "int", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "-1", ++ "related_param": [] ++ }, ++ "osd_pool_default_min_size": { ++ "desc": "设置新创建池的默认最小副本数量。增大该值可以提高数据的可用性,但会增加存储的开销。该参数确保在操作处于降级模式时,满足最小副本数量以避免数据丢失。默认值为0,表示没有特定的最小值。如果设置为0,则最小值为size - (size / 2)。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ 0, ++ 255 ++ ], ++ "default_value": "0", ++ "related_param": [ ++ "osd_pool_default_size" ++ ] ++ }, ++ "osd_pool_default_pgp_num": { ++ "desc": "该参数设置新创建池的默认PG副本数量。增大该值可以提高数据的冗余性,但会增加存储的开销。默认值为64,最小值为1。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "64", ++ "related_param": [ ++ "osd_pool_default_pg_num", ++ 
"osd_pool_default_pg_autoscale_mode" ++ ] ++ }, ++ "osd_pool_default_pg_num": { ++ "desc": "设置新创建池的默认PG数量。增大该值可以提高数据的并行性,但可能会导致内存和CPU的使用增加。建议根据实际负载和硬件资源进行调整。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "32", ++ "related_param": [ ++ "osd_pool_default_pg_autoscale_mode" ++ ] ++ }, ++ "osd_pool_default_size": { ++ "desc": "默认情况下,Ceph会对RADOS对象创建三个副本。如果希望维护四个副本(一个主副本和三个备份副本),可以通过设置'osd_pool_default_size'参数来实现。增大该值可以提高数据的冗余性,但会增加存储的开销。", ++ "type": "discrete", ++ "dtype": "int", ++ "range": [ ++ 1, ++ null ++ ], ++ "default_value": "3", ++ "related_param": [] ++ }, ++ "osd_recovery_max_active": { ++ "desc": "OSD在恢复过程中允许的最大并发操作数。增大此值可以加快恢复速度,但可能会导致磁盘IO和网络瓶颈。建议根据集群负载情况进行调整。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "default_value": "3", ++ "related_param": [ ++ "osd_recovery_max_active_hdd", ++ "osd_recovery_max_active_ssd", ++ "osd_mclock_override_recovery_settings" ++ ] ++ }, ++ "osd_recovery_op_priority": { ++ "desc": "osd_recovery_op_priority参数用于设置恢复操作相对于客户端操作的优先级。较高的优先级可以加快恢复速度,但可能会影响正常操作的性能,导致CPU和磁盘IO瓶颈。该参数的取值范围为1-63,值越高占用的资源越多。", ++ "type": "discrete", ++ "dtype": "integer", ++ "range": [ ++ 1, ++ 63 ++ ], ++ "default_value": "3", ++ "related_param": [] ++ }, ++ "osd_scrub_auto_repair": { ++ "desc": "启用该参数将自动修复在scrubbing或deep-scrubbing过程中发现的错误,从而减少手动干预,提高系统的可靠性。该功能有助于确保数据的完整性和可用性。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "false", ++ "related_param": [ ++ "osd_scrub_auto_repair_num_errors" ++ ] ++ }, ++ "osd_scrub_auto_repair_num_errors": { ++ "desc": "在scrubbing过程中,OSD允许自动修复的最大错误数量。若发现的错误数量超过该值,则不会进行自动修复。增大该值可以提高修复能力,但可能会增加CPU和磁盘IO的负担。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "5", ++ "related_param": [ ++ "osd_scrub_auto_repair" ++ ] ++ }, ++ "osd_scrub_load_threshold": { ++ "desc": "设置OSD在进行scrubbing时允许的最大负载阈值。增大该值可以使OSD在高负载情况下仍然进行scrubbing,可能会导致CPU和disk IO瓶颈加重。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "0.5", ++ "related_param": [] ++ }, ++ "osd_scrub_max_interval": { ++ "desc": "该参数定义了每个PG进行数据清理操作的最大间隔时间,单位为秒。增大该值可以减少清理操作的频率,从而缓解CPU和磁盘IO的瓶颈。默认值为7天(即86400秒)。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "7 days", ++ "related_param": [ ++ "osd_scrub_min_interval" ++ ] ++ }, ++ "osd_scrub_min_interval": { ++ "desc": "设置OSD进行scrubbing操作的最小时间间隔,单位为秒。增大该值可以减少scrubbing对系统性能的影响,特别是在高负载情况下,缓解CPU和disk IO瓶颈。默认值为1天(86400秒)。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "86400", ++ "related_param": [ ++ "osd_scrub_max_interval" ++ ] ++ }, ++ "osd_scrub_priority": { ++ "desc": "osd_scrub_priority参数用于控制Ceph集群中数据清理操作的默认优先级。当池未指定scrub_priority值时,使用该参数的值。增大该值可以提高清理操作的优先级,从而缓解磁盘IO瓶颈,确保在清理操作阻塞客户端操作时,可以提升到osd_client_op_priority的值。", ++ "type": "discrete", ++ "dtype": "uint", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "5", ++ "related_param": [] ++ }, ++ "rbd_cache": { ++ "desc": "该参数控制RBD的缓存功能。启用缓存可以缓解磁盘IO和网络瓶颈,提高性能。取值为'false'时禁用缓存,'true'时启用缓存。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "false", ++ "related_param": [] ++ }, ++ "rbd_cache_max_dirty": { ++ "desc": "该参数定义了RBD缓存中允许的最大脏数据字节数(Bytes)。当缓存模式为write-back时,增大该值可以允许更多的脏数据,从而缓解内存瓶颈;而设置为0则会使用write-through模式,可能会影响性能。需要注意的是,增大该值可能会增加数据丢失的风险。", ++ "type": "continuous", ++ "dtype": "integer", 
++ "range": [ ++ 0, ++ null ++ ], ++ "default_value": "25165824", ++ "related_param": [] ++ }, ++ "rbd_cache_max_dirty_age": { ++ "desc": "该参数定义了RBD缓存中脏数据的最大存在时间,以秒为单位。在被刷新到存储盘前,脏数据可以在缓存中存在的时间。增大该值可以允许脏数据存在更长时间,从而缓解内存瓶颈,但可能导致数据更新不及时。默认值为60秒,最大值为86400秒。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ 86400 ++ ], ++ "default_value": "60", ++ "related_param": [] ++ }, ++ "rbd_cache_size": { ++ "desc": "该参数定义了RBD缓存的大小,以字节为单位。增大该值可以提高缓存命中率,减小则可能导致缓存不足,进而缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 0, ++ 1073741824 ++ ], ++ "default_value": "33554432", ++ "related_param": [] ++ }, ++ "rbd_cache_writethrough_until_flush": { ++ "desc": "该参数控制在写入期间是否使用直写缓存,直到数据被刷新。该选项是为了兼容linux-2.6.32之前的virtio驱动,避免因为不发送flush请求而导致数据不回写。启用该参数可以缓解磁盘IO瓶颈。取值为'true'时启用,'false'时禁用。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ null, ++ null ++ ], ++ "default_value": "true", ++ "related_param": [] ++ } ++} ++ +diff --git a/copilot-tune/src/knowledge_base/knob_params/flink.json b/copilot-tune/src/knowledge_base/knob_params/flink.json +new file mode 100644 +index 0000000..670f2a7 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/flink.json +@@ -0,0 +1,300 @@ ++{ ++ "execution.checkpointing.interval": { ++ "desc": "设置执行检查点的时间间隔,单位为毫秒。该参数用于启用和控制状态快照的频率,能够缓解系统在状态恢复时的磁盘IO瓶颈。增大该值可以减少检查点的频率,从而降低系统负载,但可能导致恢复时间增加;减小该值可以提高数据一致性,但会增加系统负载。要启用检查点,必须将此值设置为大于0的正整数。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 10000 ++ }, ++ "execution.checkpointing.max-concurrent-checkpoints": { ++ "desc": "设置最大并发检查点的数量,用于控制同时进行的检查点数量。增大该值可以提高系统的并发能力,但可能导致资源竞争;减小该值可以降低系统负载,但可能影响数据一致性。该参数能够缓解系统在 checkpoint 过程中可能出现的 disk IO 瓶颈。", ++ "type": "discrete", ++ "range": [ ++ 1, ++ 10 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ }, ++ "execution.checkpointing.min-pause": { ++ "desc": "设置两个连续检查点之间的最小暂停时间,单位为毫秒。该参数用于控制检查点之间的间隔,能够缓解系统在 checkpoint 过程中可能出现的 CPU 瓶颈。增大该值可以减少检查点的频率,降低系统负载;减小该值可以提高检查点的频率,但可能导致系统资源竞争。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "0 ms" ++ }, ++ "execution.checkpointing.timeout": { ++ "desc": "设置检查点的超时时间,单位为毫秒;用于控制检查点的最大执行时间,能够缓解系统在 checkpoint 过程中可能出现的 memory 瓶颈。增大该值可以容忍更长的检查点时间,但可能导致系统资源占用增加;减小该值可以加快失败检测,但可能导致频繁的检查点失败。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 60000 ++ }, ++ "jobmanager.memory.flink.size": { ++ "desc": "JobManager的总内存大小,包括JVM堆内存和非堆内存,但不包括JVM Metaspace和JVM Overhead。增加此值可以缓解内存瓶颈,提升JobManager的性能,主要用于Flink的内部数据结构和状态管理。", ++ "type": "discrete", ++ "range": [ ++ null, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "(none)" ++ }, ++ "jobmanager.memory.jvm-overhead.max": { ++ "desc": "设置 JobManager JVM 的最大开销内存,确保 JVM 在高负载下有足够的内存。增加此值可以缓解内存瓶颈,提升系统的稳定性。此参数用于指定为 JVM 开销保留的非堆内存,例如线程栈空间、编译缓存等。", ++ "type": "discrete", ++ "range": [ ++ null, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "1 gb" ++ }, ++ "jobmanager.memory.jvm-overhead.min": { ++ "desc": "设置 JobManager JVM 的最小开销内存,确保 JVM 有足够的内存用于运行。此参数用于预留 JVM 的非堆内存开销,包括线程栈空间、编译缓存等。增加此值可以缓解内存瓶颈,避免 JVM 频繁进行垃圾回收。", ++ "type": "discrete", ++ "range": [ ++ null, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "192 mb" ++ }, ++ "jobmanager.memory.process.size": { ++ "desc": "设置 JobManager 进程的总内存大小,包括 JVM heap、off-heap、网络缓冲等。此参数影响 JobManager 的内存使用情况,适当增加此值可以缓解内存瓶颈,确保 JobManager 能够处理更多的任务和状态信息。该值包括所有 JobManager JVM 进程消耗的内存,包括总 Flink 内存、JVM Metaspace 和 JVM Overhead。", ++ "type": "continuous", ++ "range": [ ++ 
1024, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": "(none)" ++ }, ++ "parallelism.default": { ++ "desc": "设置 Flink 作业的默认并行度,若未指定并行度则使用该值。增加此值可以缓解 CPU 瓶颈,提升作业的执行效率。默认值为1,最大值为128。", ++ "type": "discrete", ++ "range": [ ++ null, ++ null ++ ], ++ "dtype": "int", ++ "default_value": "1" ++ }, ++ "restart-strategy.fixed-delay.attempts": { ++ "desc": "设置固定延迟重启策略的重启尝试次数;用于控制在任务失败时的重启次数,能够缓解系统在故障恢复时的 CPU 瓶颈。增大该值可以提高任务的恢复能力,但可能导致资源占用增加;减小该值可以加快失败检测,但可能导致任务丢失。", ++ "type": "discrete", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ }, ++ "restart-strategy.fixed-delay.delay": { ++ "desc": "该参数设置固定延迟重启策略的重启延迟时间,单位为毫秒。它用于控制在任务失败后的重启等待时间,能够缓解系统在故障恢复时的网络瓶颈。增大该值可以减少重启频率,降低系统负载;减小该值可以加快重启速度,但可能导致资源竞争。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 10000 ++ }, ++ "restart-strategy": { ++ "desc": "定义在任务失败时的重启策略,以缓解系统在故障恢复时的内存瓶颈。可选值包括:'none'(不重启)、'fixed-delay'(固定延迟重启)、'failure-rate'(基于失败率的重启)、'exponential-delay'(指数延迟重启)。", ++ "type": "discrete", ++ "range": null, ++ "dtype": "string", ++ "default_value": "none" ++ }, ++ "state.backend.incremental": { ++ "desc": "该参数用于设置是否启用增量快照,控制状态快照的存储方式。启用增量快照可以减少存储空间的使用,并缓解系统在状态管理时的磁盘IO瓶颈,但可能会增加恢复时间。", ++ "type": "discrete", ++ "range": null, ++ "dtype": "boolean", ++ "default_value": "false" ++ }, ++ "state.backend": { ++ "desc": "设置状态后端类型;用于定义 Flink 如何存储状态,能够缓解系统在状态管理时的 disk IO 瓶颈。可选值包括 'memory'、'filesystem' 和 'rocksdb'。", ++ "type": "discrete", ++ "range": null, ++ "dtype": "string", ++ "default_value": "memory" ++ }, ++ "state.backend.rocksdb.block.cache-size": { ++ "desc": "设置 RocksDB 的块缓存大小,单位为字节。该参数用于控制 RocksDB 在内存中缓存的数据块数量,从而提高读取性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "8 mb" ++ }, ++ "state.backend.rocksdb.checkpoint.transfer.thread.num": { ++ "desc": "该参数设置每个有状态操作符用于传输(下载和上传)RocksDBStateBackend文件的线程数量。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ }, ++ "state.backend.rocksdb.compaction.level.max-size-level-base": { ++ "desc": "设置 RocksDB 的最大压缩级别基础大小,单位为字节。该参数用于优化存储性能,能够缓解系统在状态管理时的磁盘 I/O 瓶颈。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "256 mb" ++ }, ++ "state.backend.rocksdb.thread.num": { ++ "desc": "该参数设置 RocksDB 的最大并发后台刷新和压缩作业数量(每个有状态操作符)。", ++ "type": "discrete", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 2 ++ }, ++ "state.backend.rocksdb.use-bloom-filter": { ++ "desc": "该参数用于设置是否启用 Bloom Filter,以优化 RocksDB 的查询性能。", ++ "type": "discrete", ++ "range": null, ++ "dtype": "boolean", ++ "default_value": "false" ++ }, ++ "state.backend.rocksdb.write-batch-size": { ++ "desc": "设置 RocksDB 的写批处理大小,单位为字节。该参数用于优化写入性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "2 mb" ++ }, ++ "state.backend.rocksdb.writebuffer.count": { ++ "desc": "设置 RocksDB 的写缓冲区数量,用于优化写入性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ null ++ ], ++ "dtype": "int", ++ "default_value": 2 ++ }, ++ "taskmanager.memory.flink.size": { ++ "desc": "设置 TaskManager 用于 Flink 的内存大小,主要用于 Flink 的内部数据结构和状态管理。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": 1024 ++ }, ++ "taskmanager.memory.managed.fraction": { ++ "desc": "该参数设置 TaskManager 管理的内存占总内存的比例,控制 Flink 的内存使用。", ++ "type": "continuous", ++ "range": [ ++ 0.0, ++ 1.0 ++ ], ++ "dtype": "float", ++ "default_value": 0.4 ++ }, ++ 
"taskmanager.memory.managed.size": { ++ "desc": "设置 TaskManager 管理的内存大小,主要用于 Flink 的状态管理和数据处理。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ }, ++ "taskmanager.memory.network.fraction": { ++ "desc": "设置 TaskManager 网络缓冲区占总内存的比例,控制网络缓冲区的大小。", ++ "type": "continuous", ++ "range": [ ++ 0.0, ++ 1.0 ++ ], ++ "dtype": "float", ++ "default_value": 0.1 ++ }, ++ "taskmanager.memory.network.max": { ++ "desc": "最大网络内存大小,用于TaskExecutors。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "1 gb" ++ }, ++ "taskmanager.memory.network.min": { ++ "desc": "设置 TaskManager 网络缓冲区的最小大小,确保在低负载情况下网络缓冲区不会过小。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "string", ++ "default_value": "64 mb" ++ }, ++ "taskmanager.memory.process.size": { ++ "desc": "设置 TaskManager 进程的总内存大小,包括 JVM heap、off-heap、网络缓冲等。", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": 1024 ++ }, ++ "taskmanager.memory.task.heap.size": { ++ "desc": "设置 TaskManager 任务的堆内存大小,主要用于任务的计算和状态管理。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": 1024 ++ }, ++ "taskmanager.memory.task.off-heap.size": { ++ "desc": "设置 TaskManager 任务的 off-heap 内存大小,主要用于存储大对象和避免 JVM 的垃圾回收。", ++ "type": "discrete", ++ "range": [ ++ null, ++ null ++ ], ++ "dtype": "string", ++ "default_value": "0 bytes" ++ }, ++ "taskmanager.numberOfTaskSlots": { ++ "desc": "该参数设置 TaskManager 的任务槽数量,决定了单个 TaskManager 可以并行处理的任务数量。", ++ "type": "discrete", ++ "range": [ ++ 1, ++ 128 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/gaussdb.json b/copilot-tune/src/knowledge_base/knob_params/gaussdb.json +new file mode 100644 +index 0000000..95f6466 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/gaussdb.json +@@ -0,0 +1,382 @@ ++{ ++ "work_mem": { ++ "desc": "设置内部排序操作和Hash表在开始写入临时磁盘文件之前使用的内存大小。ORDER BY,DISTINCT和merge joins都要用到排序操作。Hash表在散列连接、散列为基础的聚集、散列为基础的IN子查询处理中都要用到。建议此参数设置值大于算子执行时占用内存的最大值,否则可能会触发落盘", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 65536 ++ }, ++ "max_connections": { ++ "desc": "允许和数据库连接的最大并发连接数。此参数会影响数据库的并发能力。该参数调小会导致实际可用连接数变小。设置过大可能因内存不足导致DN启动失败或运行时异常", ++ "type": "continuous", ++ "range": [ ++ 10, ++ 40000 ++ ], ++ "dtype": "Integer", ++ "default_value": 40000 ++ }, ++ "checkpoint_segments": { ++ "desc": "设置checkpoint_timeout周期内所保留WAL日志文件的最大数量。每个日志文件大小为16MB。提升此参数可加快大数据的导入速度,但需要结合checkpoint_timeout、shared_buffers这两个参数统一考虑。该参数设置过小会导致频繁触发checkpoint;设置过大可能会增大RTO,且会导致WAL日志保留的文件数量增多,增大占用磁盘空间。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 2147483646 ++ ], ++ "dtype": "Integer", ++ "default_value": 1024 ++ }, ++ "checkpoint_timeout": { ++ "desc": "设置自动检查点之间的最长时间。在提升checkpoint_segments以加快大数据导入的场景需将此参数调大,同时这两个参数提升会加大shared_buffers的负担,需要综合考虑。", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 3600 ++ ], ++ "dtype": "Integer", ++ "default_value": 900 ++ }, ++ "max_files_per_process": { ++ "desc": "设置每个数据库线程允许同时打开的最大文件数量。", ++ "type": "continuous", ++ "range": [ ++ 25, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 1024 ++ }, ++ "max_prepared_transactions": { ++ "desc": "设置可以同时处于“预备”状态的事务的最大数目。增加此参数的值会使GaussDB Kernel比系统默认设置需要更多的System 
V共享内存。一般推荐200(196核CPU/1536G内存,128核CPU/1024G内存,104核CPU/1024G内存,96核CPU/1024G内存,96核CPU/768G内存,80核CPU/640G内存,64核CPU/512G内存,60核CPU/480G内存,32核CPU/256G内存,16核CPU/128G内存,8核CPU/64G内存,4核CPU/32G内存);0(4核CPU/16G内存)", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 262143 ++ ], ++ "dtype": "Integer", ++ "default_value": 200 ++ }, ++ "shared_buffers": { ++ "desc": "设置GaussDB Kernel使用的共享内存大小。增加此参数的值会使GaussDBKernel比系统默认设置需要更多的System V共享内存。建议设置shared_buffers值为内存的40%以内。如果设置较大的shared_buffers,需要同时增加checkpoint_segments的值,因为写入大量新增数据、修改数据需要消耗更多的时间周期。", ++ "type": "continuous", ++ "range": [ ++ 16, ++ 1073741823 ++ ], ++ "dtype": "Integer", ++ "default_value": 35389440 ++ }, ++ "use_workload_manager": { ++ "desc": "设置是否开启资源管理功能。多租数据库特性(enable_mtd)需要开启该参数。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "on" ++ }, ++ "wal_buffers": { ++ "desc": "设置用于存放WAL数据的共享内存空间的大小,可以以XLOG_BLCKSZ数量表示,同时也可以以实际的大小表示。XLOG_BLCKSZ是XLOG块的大小,一般默认为8kB。取值2048表示wal_buffers为2048 * 8kB;取值20480kB表示wal_buffers为20480kB。取值如果要带单位,必须为kB、MB、GB,且必须设置为8kB整数倍。", ++ "type": "continuous", ++ "range": [ ++ -1, ++ 262144 ++ ], ++ "dtype": "Integer", ++ "default_value": 131072 ++ }, ++ "maintenance_work_mem": { ++ "desc": "指定在维护性操作中使用的最大的内存量。参数取值一定要大于向量索引构建时对数据进行采样所需的内存。对于gsivfflat索引,采样所需内存估算为max(nlist, nlist2) * dim * 0.2kB。对于gsdiskann索引,开启pq时,采样所需内存为dim * 80kB。该参数设置过小,会导致索引无法正常创建,其他需要较大内存的业务也会失败", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 65536 ++ }, ++ "vacuum_cost_limit": { ++ "desc": "设置清理线程休眠的开销限制。当vacuum线程执行的代价达到vacuum_cost_limit时,该线程休眠一次,休眠vacuum_cost_delay时间。该值越大,Vacuum 的I/O频次限制越小,越不容易进入休眠状态,Vacuum效率越高,对业务I/O的影响就越大。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 10000 ++ ], ++ "dtype": "Integer", ++ "default_value": 1000 ++ }, ++ "autovacuum_max_workers": { ++ "desc": "设置能同时运行的自动清理线程的最大数量,该参数的取值上限与GUC参数max_connections和job_queue_processes大小有关。此参数设置越大,自动清理功能创建的线程数越多,占用的系统CPU和内存资源越多,所以设置时不建议设置过大,避免由于此参数设置过大导致内存无法分配或者占用过多CPU资源,导致数据库启动报错或业务受到影响。如果将该参数设置得过低,可能会导致autovacuum线程无法及时清理和回收不再使用的表空间,从而导致数据库膨胀和性能下降。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 262143 ++ ], ++ "dtype": "Integer", ++ "default_value": 3 ++ }, ++ "autovacuum_naptime": { ++ "desc": "设置两次自动清理操作的时间间隔。例如,不带单位取值600,表示autovacuum_naptime为600s;带单位取值10min,表示autovacuum_naptime为10min。取值如果要带单位,必须为s、min、h、d。仅用Ustore表时推荐使用默认值。仅用Ustore表时推荐使用默认值。对于Astore表,设置值取决于用户的数据库环境和应用程序负载。如果数据库环境中有大量表的更新或删除操作,建议将该参数设置为较短的时间间隔,以确保及时清理无用的数据并避免数据库空间不足的情况。如果数据库环境中的更新或删除操作较少,则可以将该参数设置为较长的时间间隔,以减少自动清理线程的开销。如果将autovacuum_naptime设置为过短的时间间隔,可能会导致autovacuum线程的开销过大,从而影响数据库的性能。如果将autovacuum_naptime设置为过长的时间间隔,可能会导致无用的数据堆积,从而占用数据库空间,影响数据库的性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 2147483 ++ ], ++ "dtype": "Integer", ++ "default_value": 600 ++ }, ++ "autovacuum_vacuum_cost_delay": { ++ "desc": "设置在自动VACUUM操作里使用的开销延迟数值。其中-1表示使用常规的vacuum_cost_delay。", ++ "type": "continuous", ++ "range": [ ++ -1, ++ 100 ++ ], ++ "dtype": "Integer", ++ "default_value": 20 ++ }, ++ "autovacuum_vacuum_scale_factor": { ++ "desc": "VACUUM时表的规模因子,用于计算触发VACUUM的阈值。触发VACUUM的阈值=autovacuum_vacuum_threshold+reltuples(表上元组的个数)*autovacuum_vacuum_scale_factor。当表上被删除或更新的记录数超过阈值时,才会对这个表执行VACUUM操作。", ++ "type": "continuous", ++ "range": [ ++ "0.0", ++ "100.0" ++ ], ++ "dtype": "float", ++ "default_value": 0.2 ++ }, ++ "autovacuum_analyze_scale_factor": { ++ "desc": 
"ANALYZE时表的规模因子,用于计算触发ANALYZE的阈值。触发ANALYZE的阈值=autovacuum_analyze_threshold+reltuples(表上元组的个数)*autovacuum_analyze_scale_factor。当表上被删除、插入或更新的记录数超过阈值时,才会对这个表执行ANALYZE操作。", ++ "type": "continuous", ++ "range": [ ++ "0.0", ++ "100.0" ++ ], ++ "dtype": "float", ++ "default_value": 0.1 ++ }, ++ "enable_thread_pool": { ++ "desc": "控制是否使用线程池功能。多租数据库特性(enable_mtd)需要开启该参数。在低并发长连接性能敏感场景下,建议关闭该参数,其他场景下建议打开该参数。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "off" ++ }, ++ "session_timeout": { ++ "desc": "表明与服务器建立连接后,不进行任何操作的最长时间。当该参数取值不为0,用户不进行任何操作的时间超过该参数取值后,会与服务器断开连接。0表示关闭超时设置。正数表示与服务器建立连接后,不进行任何操作的最长时间。当超过设定的值后,会与服务器断开连接。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 86400 ++ ], ++ "dtype": "Integer", ++ "default_value": 600 ++ }, ++ "enable_beta_opfusion": { ++ "desc": "在enable_opfusion参数打开的状态下,如果开启该参数,可以支持TPCC中出现的聚集函数,排序两类SQL语句的加速执行,提升SQL执行性能。该参数可在PDB级别设置。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "off" ++ }, ++ "bgwriter_flush_after": { ++ "desc": "设置background writer线程刷脏页个数超过设定的阈值时,告知操作系统将文件缓存中的数据页面异步刷盘。GaussDB Kernel中,磁盘页大小为8kB。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 256 ++ ], ++ "dtype": "continuous", ++ "default_value": 64 ++ }, ++ "wal_keep_segments": { ++ "desc": "Xlog日志文件的数量,设置“pg_xlog”目录下保留日志文件的最小数量。备机通过获取主机的Xlog日志进行流复制。 当服务器开启日志归档或者从检查点恢复时,保留的日志文件数量可能大于wal_keep_segments设定的值。当双机为异步传输时,以COPY方式连续导入4G以上数据需要增大wal_keep_segments配置。以T6000单板为例,如果导入数据量为50G,建议调整参数为1000。可以在导入完成并且日志同步正常后,动态恢复此参数设置。若synchronous_commit级别小于LOCAL_FLUSH,重建备机时,建议调大该参数为1000,避免重建过程中,主机日志回收导致重建失败。如果设置过大,可能会导致pg_xlog文件夹占用过大的磁盘空间。如果设置过小,则在备机请求事务日志时,此事务日志可能已经被产生的新事务日志覆盖,导致请求失败,主备关系断开。", ++ "type": "continuous", ++ "range": [ ++ 2, ++ 2147483647 ++ ], ++ "dtype": "continuous", ++ "default_value": 128 ++ }, ++ "bgwriter_delay": { ++ "desc": "定期循环逐块扫描缓冲区刷脏的时间间隔。全量checkpoint模式下,根据bgwriter_lru_maxpages参数控制每次写的量,然后休眠bgwriter_delay毫秒后才再次启动;增量checkpoint模式下,根据设定candidate_buf_percent_target计算目标空闲缓冲页面个数,当候选缓冲页面不足时,每隔bgwriter_delay毫秒扫描一次10GB的缓冲区页面,当遇到脏页时把脏页刷盘后再放进候选链表,当遇到干净的页面直接放进候选链表。在许多系统上,休眠延时的有效分辨率是10毫秒。因此,设置一个不是10倍数的数值与把它设置为一个10的倍数是一样的效果。", ++ "type": "continuous", ++ "range": [ ++ 10, ++ 10000 ++ ], ++ "dtype": "continuous", ++ "default_value": 2000 ++ }, ++ "incremental_checkpoint_timeout": { ++ "desc": "开启增量检查点开关(enable_incremental_checkpoint)之后,设置自动检查点之间的最长时间。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 3600 ++ ], ++ "dtype": "continuous", ++ "default_value": 60 ++ }, ++ "walwriter_sleep_threshold": { ++ "desc": "xlogflusher进入休眠之前空闲xlog刷新的次数,达到阈值会休眠。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 50000 ++ ], ++ "dtype": "continuous", ++ "default_value": 50 ++ }, ++ "reserve_space_for_nullable_atts": { ++ "desc": "指定是否为Ustore表的可空属性预留空间。该参数可在PDB级别设置。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "on" ++ }, ++ "enable_default_ustore_table": { ++ "desc": "指定是否默认使用Ustore存储引擎。仅当enable_ustore参数开启时,该参数设置有效。该参数为on时,创建的表默认都为Ustore表;该参数为off时,创建的表默认都为Astore表。该参数可在PDB级别设置。升级场景下该参数的默认值会优先保证兼容性,即升级后的默认值与老版本的取值一致。需要特别注意的是,使用Ustore表时,必须要开启track_counts和track_activities参数,否则会引起空间膨胀。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "on" ++ }, ++ "max_redo_log_size": { ++ "desc": "备DN表示最新日志回放点到最新检查点位置之间日志量的期望值,主DN表示最新日志插入点到最新检查点位置之间日志量的期望值。该参数被用来参与控制检查点刷页速度。", ++ "type": "continuous", ++ "range": [ ++ 163840, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 1048576 ++ }, ++ 
"recovery_time_target": { ++ "desc": "备机完成日志写入和回放的流控阈值。当备机的回放完成预期时间大于此参数,主机会触发日志流控,将降低主机向备机发送日志的速率。设置过小会影响主机的性能,例如设置为10,说明备机回放相对备机接收日志有延迟,允许延迟最大为10s,如果超过了10s,则会限制主机往备机段发送日志,对主机的性能会有限制,在主备场景下,主机的事务需要有备机日志落盘才能提交,限制了日志的发送会限制主机的事务执行影响到性能。设置过大会失去流控效果。关闭流控后,可能会出现RTO/RPO升高的情况。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 3600 ++ ], ++ "dtype": "Integer", ++ "default_value": 60 ++ }, ++ "lockwait_timeout": { ++ "desc": "控制单个锁的最长等待时间。当申请的锁等待时间超过设定值时,系统会报错。该参数仅针对常规锁生效。该参数可在PDB级别设置。设置过大时,锁冲突造成的阻塞时间更长;设置过小时,可能会出现更多的锁超时报错。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 120000 ++ }, ++ "update_lockwait_timeout": { ++ "desc": "在业务允许并发更新的情况下,该参数控制并发更新同一行时单个锁的最长等待时间。取值如果要带单位,必须为ms、s、min、h、d。当申请的锁等待时间超过设定值时,系统会报错。该参数仅针对常规锁生效。该参数可在PDB级别设置。设置值主要取决于并发更新同一行的事务的workload。一般TP业务,执行时间较短小于两分钟,因此等待并发更新事务提交时间较短,按默认值2min的配置误报锁超时概率较小;如果业务出现大量并发更新同一行事务,执行时间超过两分钟,可以调大该参数避免等锁超时误报。设置过大时,并发更新同一行时锁冲突造成的阻塞时间更长;设置过小时,可能会出现更多的锁超时报错。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 120000 ++ }, ++ "recovery_parse_workers": { ++ "desc": "是极致RTO特性中ParseRedoRecord线程的数量。1. 需要配合recovery_redo_workers使用。recovery_parse_workers或者recovery_redo_workers大于1,则开启极致RTO。如不期望开启极致RTO,请保持recovery_parse_workers为默认值1;2. 在开启极致RTO时确保参数replication_type为1;3. 若同时开启极致RTO和并行回放,则极致RTO特性生效,并行回放特性失效;4. 极致RTO不支持流控,流控统一由recovery_time_target参数控制。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 16 ++ ], ++ "dtype": "Integer", ++ "default_value": 1 ++ }, ++ "recovery_max_workers": { ++ "desc": "设置并行回放线程的最大数量。recovery_parse_workers和recovery_redo_workers同时为1,即不开启极致RTO,此参数才能生效。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 20 ++ ], ++ "dtype": "Integer", ++ "default_value": 4 ++ }, ++ "track_counts": { ++ "desc": "控制是否收集数据库活动的统计数据。该参数可在PDB级别设置。在AutoVacuum自动清理线程中选择清理的数据库时,需要数据库的统计数据,故默认值设置为on。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "on" ++ }, ++ "enable_pbe_optimization": { ++ "desc": "设置优化器是否对以PBE(Parse Bind Execute)形式执行的语句进行查询计划的优化,优化原理是使FQS选择gplan计划。该参数可在PDB级别设置。", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "off" ++ }, ++ "check_disconnect_query": { ++ "desc": "该参数控制的是,当客户端异常断连(如JDBC触发socketTimeout、libpq触发rwtimeout且关闭连接、运行业务过程中客户端进程终止等)后,GaussDB服务端语句是否终止执行", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "on" ++ }, ++ "enable_cached_context": { ++ "desc": "控制缓存内存上下文的个数。多租场景下,该参数可在PDB级别设置。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1024 ++ ], ++ "dtype": "Integer", ++ "default_value": "128" ++ }, ++ "enable_cached_operator": { ++ "desc": "控制算子执行态缓存特性的开关", ++ "type": "discrete", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "dtype": "boolean", ++ "default_value": "on" ++ }, ++ "max_process_memory": { ++ "desc": "设置一个数据库节点可用的最大物理内存。该参数用于防止数据库内存使用超出系统承载能力,避免节点OOM。建议值为物理内存的66.5%以内(考虑预留操作系统内存)。修改后需重启实例生效。", ++ "type": "continuous", ++ "range": [ ++ 2097152, ++ 2147483647 ++ ], ++ "dtype": "Integer", ++ "default_value": 696320000 ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/mysql.json b/copilot-tune/src/knowledge_base/knob_params/mysql.json +new file mode 100644 +index 0000000..df04b95 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/mysql.json +@@ -0,0 +1,621 @@ ++{ ++ "innodb_adaptive_flushing": { ++ "desc": 
"该参数指定是否根据工作负载动态调整InnoDB缓冲池中脏页的刷新速率。启用自适应刷新机制可以提高性能,并缓解磁盘I/O瓶颈。取值为ON时启用自适应刷新,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ }, ++ "innodb_adaptive_flushing_lwm": { ++ "desc": "定义了自适应刷新启用时的低水位线,表示重做日志容量的百分比。增大此值可以提高写入性能,但可能会增加恢复时间,能够缓解磁盘IO瓶颈。默认值为10。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 70 ++ ], ++ "dtype": "int", ++ "default_value": 10 ++ }, ++ "innodb_adaptive_hash_index": { ++ "desc": "自适应哈希索引是一种机制,允许InnoDB根据查询模式动态创建哈希索引,以加速查询。它可以缓解CPU瓶颈,提升查询性能。该参数取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ }, ++ "innodb_adaptive_hash_index_parts": { ++ "desc": "InnoDB自适应哈希索引的分区数量。增大此值可以提高哈希索引的性能,但可能会增加内存使用。可以缓解内存瓶颈。默认值为8,最大值为512,最小值为1。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 512 ++ ], ++ "dtype": "int", ++ "default_value": 8 ++ }, ++ "innodb_adaptive_max_sleep_delay": { ++ "desc": "该参数允许InnoDB根据当前工作负载自动调整innodb_thread_sleep_delay的值。它定义了InnoDB在自适应最大睡眠延迟期间的最大延迟时间(以微秒为单位),可以帮助缓解CPU瓶颈,尤其是在高并发情况下。增大该值可以使InnoDB在等待锁时更长时间地休眠,从而减少CPU的使用率,减小该值则会使InnoDB更快地尝试获取锁。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1000000 ++ ], ++ "dtype": "int", ++ "default_value": 150000 ++ }, ++ "innodb_api_bk_commit_interval": { ++ "desc": "该参数定义了InnoDB memcached接口中空闲连接的自动提交时间间隔(以秒为单位)。增大该值可以减少提交操作的频率,从而帮助缓解CPU瓶颈,减小该值则会增加提交操作的频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1073741824 ++ ], ++ "dtype": "int", ++ "default_value": 5 ++ }, ++ "innodb_api_disable_rowlock": { ++ "desc": "该参数用于禁用InnoDB API在执行DML操作时的行锁定。设置为ON时禁用行锁定,设置为OFF时启用行锁定。禁用行锁定可能会导致性能下降,但在某些情况下可以帮助缓解CPU瓶颈。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ }, ++ "innodb_buffer_pool_chunk_size": { ++ "desc": "该参数定义了InnoDB缓冲池的每个块的大小(以字节为单位)。增大该值可以减少块的数量,从而减少管理开销,减小该值则会增加块的数量,可能会提高并发性能。该参数对于缓解内存瓶颈非常重要。", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ "innodb_buffer_pool_size / innodb_buffer_pool_instances" ++ ], ++ "dtype": "int", ++ "default_value": 134217728 ++ }, ++ "innodb_buffer_pool_dump_at_shutdown": { ++ "desc": "指示是否在关闭时转储缓冲池的内容。启用此选项可以提高下次启动时的性能。取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "ON", ++ "OFF" ++ ], ++ "dtype": "string", ++ "default_value": "ON" ++ }, ++ "innodb_buffer_pool_dump_now": { ++ "desc": "该参数用于立即转储InnoDB缓冲池的内容到磁盘。它通常与参数innodb_buffer_pool_load_now结合使用,能够帮助缓解磁盘IO瓶颈,尤其是在重启后需要快速恢复数据时。取值为ON时表示立即转储,OFF时表示不转储。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "Boolean", ++ "default_value": "OFF" ++ }, ++ "innodb_buffer_pool_dump_pct": { ++ "desc": "该参数定义了在缓冲池转储时要转储的页面百分比。它指定了每个缓冲池中最近使用页面的百分比,以便在重启时更快地恢复数据。增大该值可以加快重启时的数据恢复速度,而减小该值则会减少转储的数据量,从而有助于缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "dtype": "Integer", ++ "default_value": 25 ++ }, ++ "innodb_buffer_pool_instances": { ++ "desc": "设置缓冲池实例的数量,允许多个缓冲池实例并行处理请求,以提高性能。可以缓解内存瓶颈。建议在缓冲池大小大于1GB时使用多个实例。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "Integer", ++ "default_value": 8 ++ }, ++ "innodb_buffer_pool_in_core_file": { ++ "desc": "该参数指示是否将InnoDB缓冲池的内容写入核心转储文件。禁用该参数可以减少核心文件的大小,适用于故障排除。取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "Boolean", ++ "default_value": "ON" ++ }, ++ "innodb_buffer_pool_load_at_startup": { ++ "desc": "指示在MySQL服务器启动时,InnoDB缓冲池是否自动加载之前持有的页面,以提高启动性能。取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "ON", ++ "OFF" ++ ], ++ "dtype": "string", ++ "default_value": "OFF" 
++ }, ++ "innodb_buffer_pool_load_now": { ++ "desc": "该参数用于立即加载InnoDB缓冲池的内容,帮助缓解磁盘IO瓶颈,尤其在重启后需要快速恢复数据时。取值为ON时表示立即加载,OFF时表示不加载。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ }, ++ "innodb_buffer_pool_size": { ++ "desc": "设置InnoDB缓冲池的大小,缓冲池用于缓存数据和索引,以提高性能。增大该值可以减少磁盘I/O,提高读取性能,建议设置为系统内存的70%-80%。这是InnoDB最重要的参数之一,直接影响数据库的性能和响应速度。", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ { ++ "64-bit": "18446744073709551615 (2^64-1)", ++ "32-bit": "4294967295 (2^32-1)" ++ } ++ ], ++ "dtype": "int", ++ "default_value": 134217728 ++ }, ++ "innodb_change_buffering": { ++ "desc": "该参数控制InnoDB如何缓冲对辅助索引的更改,优化写入操作,能够缓解磁盘I/O瓶颈。可选值包括:none(禁用)、inserts(仅插入)、deletes(仅删除)、changes(更改)、purges(清除)和all(全部)。", ++ "type": "discrete", ++ "range": [ ++ "none", ++ "inserts", ++ "deletes", ++ "changes", ++ "purges", ++ "all" ++ ], ++ "dtype": "string", ++ "default_value": "all" ++ }, ++ "innodb_change_buffer_max_size": { ++ "desc": "该参数定义了InnoDB更改缓冲区的最大大小(以百分比表示),作为缓冲池总大小的一个百分比。增大该值可以提高写入性能,帮助缓解磁盘IO瓶颈,而减小该值则会限制更改缓冲区的大小,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 50 ++ ], ++ "dtype": "int", ++ "default_value": 25 ++ }, ++ "innodb_checksum_algorithm": { ++ "desc": "指定如何生成和验证存储在InnoDB表空间磁盘块中的校验和,以确保数据完整性。可以缓解CPU瓶颈。可选值包括crc32、strict_crc32、innodb、strict_innodb、none和strict_none,建议使用crc32以获得较好的性能和安全性。", ++ "type": "discrete", ++ "range": [ ++ "crc32", ++ "strict_crc32", ++ "innodb", ++ "strict_innodb", ++ "none", ++ "strict_none" ++ ], ++ "dtype": "string", ++ "default_value": "crc32" ++ }, ++ "innodb_commit_concurrency": { ++ "desc": "该参数控制同时提交的事务数量,允许多个线程并发提交以提高性能。适当调整此参数可以缓解CPU瓶颈,建议根据系统负载进行调整,通常设置为8或16。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1000 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ }, ++ "innodb_compression_level": { ++ "desc": "指定用于InnoDB压缩表和索引的zlib压缩级别。该参数控制数据压缩的强度,可以缓解磁盘IO瓶颈。取值范围为0(无压缩)到9(最大压缩),增大该值会增加CPU使用率,但可以减少磁盘空间的使用。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 9 ++ ], ++ "dtype": "int", ++ "default_value": 6 ++ }, ++ "innodb_compression_pad_pct_max": { ++ "desc": "该参数定义了InnoDB压缩填充的最大百分比。它指定每个压缩页面中可以保留的最大空闲空间百分比。增大该值可以提高压缩效率,减小该值则可能会导致压缩效率下降,从而帮助缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 75 ++ ], ++ "dtype": "int", ++ "default_value": 50 ++ }, ++ "innodb_concurrency_tickets": { ++ "desc": "该参数定义了InnoDB的并发票据数,决定了可以同时进入InnoDB的线程数量。增大该值可以提高并发性能,帮助缓解CPU瓶颈,而减小该值则可能导致性能下降。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 5000 ++ }, ++ "innodb_ddl_threads": { ++ "desc": "该参数定义了InnoDB在执行DDL操作时可以使用的最大并行线程数。增大该值可以提高DDL操作的并发性,帮助缓解CPU瓶颈,而减小该值则可能导致性能下降。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ }, ++ "innodb_default_row_format": { ++ "desc": "该参数定义了InnoDB表和用户创建的临时表的默认行格式。选择合适的行格式可以优化存储和性能,帮助缓解磁盘IO瓶颈。可选的行格式包括DYNAMIC、COMPRESSED、REDUNDANT和COMPACT。", ++ "type": "discrete", ++ "range": [ ++ "REDUNDANT", ++ "COMPACT", ++ "DYNAMIC", ++ "COMPRESSED" ++ ], ++ "dtype": "enumeration", ++ "default_value": "DYNAMIC" ++ }, ++ "innodb_disable_sort_file_cache": { ++ "desc": "该参数用于禁用操作系统文件系统缓存,以便在进行合并排序的临时文件时提高性能。启用该参数(值为ON)可以帮助缓解磁盘IO瓶颈,而禁用缓存(值为OFF)可能会导致性能下降。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ }, ++ "innodb_doublewrite": { ++ "desc": "控制双写缓冲机制,以提高数据安全性。可以缓解磁盘IO瓶颈。MySQL 8.0.30及以上版本支持多种设置,取值为ON时启用,OFF时禁用,DETECT_AND_RECOVER和DETECT_ONLY用于检测和恢复。建议保持启用以防止数据丢失。", ++ "type": "discrete", ++ "range": [ ++ "ON", ++ "OFF", ++ 
"DETECT_AND_RECOVER", ++ "DETECT_ONLY" ++ ], ++ "dtype": "enumeration", ++ "default_value": "ON" ++ }, ++ "innodb_fast_shutdown": { ++ "desc": "控制InnoDB的快速关闭行为,影响关闭时的性能和数据安全性。可以缓解磁盘IO瓶颈。取值为0(完全关闭)、1(快速关闭)、2(最小关闭),建议使用1以平衡性能和安全性。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 1, ++ 2 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ }, ++ "innodb_file_per_table": { ++ "desc": "启用每个表使用独立的表空间文件,以提高管理灵活性和性能,能够缓解磁盘IO瓶颈。当取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ }, ++ "innodb_flushing_avg_loops": { ++ "desc": "该参数定义了InnoDB在平均循环中进行刷新操作的次数。它控制InnoDB保持之前计算的刷新状态快照的迭代次数,从而影响自适应刷新对变化工作负载的响应速度。增大该值可以减少刷新操作的频率,可能有助于缓解磁盘IO瓶颈,而减小该值则会增加刷新操作的频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1000 ++ ], ++ "dtype": "int", ++ "default_value": 30 ++ }, ++ "innodb_flush_log_at_trx_commit": { ++ "desc": "控制事务提交时日志的刷新策略,影响数据安全性和性能。取值为0(每秒刷新一次)、1(每次提交刷新)、2(每次提交写入内存)。建议根据数据安全性需求进行选择,适当调整可以缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 1, ++ 2 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ }, ++ "innodb_flush_method": { ++ "desc": "定义用于将数据刷新到InnoDB数据文件和日志文件的方法,这会影响I/O吞吐量。可以缓解磁盘I/O瓶颈。可选值包括fsync、O_DSYNC、littlesync、nosync、O_DIRECT、O_DIRECT_NO_FSYNC、unbuffered和normal。建议根据存储设备选择合适的刷新方法。", ++ "type": "discrete", ++ "range": [ ++ "fsync", ++ "O_DSYNC", ++ "littlesync", ++ "nosync", ++ "O_DIRECT", ++ "O_DIRECT_NO_FSYNC", ++ "unbuffered", ++ "normal" ++ ], ++ "dtype": "String", ++ "default_value": [ ++ "fsync", ++ "unbuffered" ++ ] ++ }, ++ "innodb_force_recovery": { ++ "desc": "该参数用于设置InnoDB的强制恢复模式,通常在严重故障排除情况下更改。它可以帮助在崩溃后恢复数据,主要影响磁盘IO瓶颈。取值范围为0到6,其中0表示正常模式,1到6表示不同级别的恢复模式,数值越大,恢复的限制越多。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 6 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ }, ++ "innodb_ft_max_token_size": { ++ "desc": "该参数定义了InnoDB全文索引中最大令牌大小。它决定了可以被索引的单词的最大字符长度。增大该值可以允许更长的单词被索引,从而可能帮助缓解CPU瓶颈,减小该值则会限制索引的单词长度。", ++ "type": "continuous", ++ "range": [ ++ 10, ++ 84 ++ ], ++ "dtype": "int", ++ "default_value": 84 ++ }, ++ "innodb_idle_flush_pct": { ++ "desc": "该参数限制了InnoDB在空闲时的页面刷新百分比。增大该值可以提高空闲时的刷新频率,从而帮助缓解磁盘IO瓶颈;减小该值则会减少空闲时的刷新频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "default_value": 100 ++ }, ++ "innodb_io_capacity": { ++ "desc": "innodb_io_capacity变量定义了InnoDB后台任务可用的每秒I/O操作次数(IOPS)。该参数影响缓冲区刷新脏页的速率,建议根据存储设备的性能进行调整,通常设置为SAS 200,SSD 5000,PCI-E 10000-50000。", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 18446744073709551615 ++ ], ++ "dtype": "int", ++ "default_value": 200 ++ }, ++ "innodb_io_capacity_max": { ++ "desc": "innodb_io_capacity_max变量定义了InnoDB后台任务执行的最大I/O操作次数(IOPS)。该参数设置InnoDB的最大I/O容量,影响高负载情况下的I/O速率,可以缓解磁盘I/O瓶颈。建议根据存储设备的性能进行调整,通常设置为1000-20000。", ++ "type": "continuous", ++ "range": [ ++ 100, ++ "18446744073709551615" ++ ], ++ "dtype": "int", ++ "default_value": "2 * innodb_io_capacity, min of 2000" ++ }, ++ "innodb_lock_wait_timeout": { ++ "desc": "InnoDB事务在放弃之前等待行锁的时间(以秒为单位)。默认值为50秒。该参数影响事务的锁等待行为,可以缓解CPU瓶颈,建议根据应用需求进行调整。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1073741824 ++ ], ++ "dtype": "int", ++ "default_value": 50 ++ }, ++ "innodb_log_buffer_size": { ++ "desc": "InnoDB日志缓冲区的大小,单位为字节。增大此值可以提高写入性能,尤其是在高事务负载的情况下,能够缓解磁盘IO瓶颈。默认值为16MB,最小值为1KB,最大值为4GB。", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 16777216 ++ }, ++ "innodb_log_files_in_group": { ++ "desc": "InnoDB日志文件组中的日志文件数量。增加此值可以提高并发写入性能,并缓解磁盘IO瓶颈。默认值为2,最小值为2,最大值为100。", ++ "type": "discrete", ++ "range": [ ++ 2, ++ 
100 ++ ], ++ "dtype": "int", ++ "default_value": 2 ++ }, ++ "innodb_log_file_size": { ++ "desc": "每个InnoDB日志文件的大小。增大此值可以提高恢复性能,尤其是在大事务的情况下。可以缓解磁盘IO瓶颈。默认值为48MB,最大值为512GB,最小值为1MB。", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ "512GB / innodb_log_files_in_group" ++ ], ++ "dtype": "int", ++ "default_value": 50331648 ++ }, ++ "innodb_max_dirty_pages_pct": { ++ "desc": "InnoDB允许的最大脏页百分比。该参数控制InnoDB在缓冲池中允许的脏页的比例,默认值为90%。增大此值可以减少写入操作,但可能会增加恢复时间。建议在内存分配过大导致Swap占用严重时适当减小此值,以释放Swap空间。过大的值会导致每次更新需要交换的数据页过多,而过小的值则可能导致更新操作变慢。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "default_value": 90 ++ }, ++ "innodb_max_dirty_pages_pct_lwm": { ++ "desc": "该参数定义了InnoDB允许的最小脏页百分比,表示在启用预刷写以控制脏页比例时的低水位线。增大该值可以减少写入操作的频率,减小该值则会增加写入操作的频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 99.999 ++ ], ++ "dtype": "Numeric", ++ "default_value": 10 ++ }, ++ "innodb_max_purge_lag": { ++ "desc": "定义了期望的最大清除延迟。如果超过此值,将对INSERT、UPDATE和DELETE操作施加延迟。增大此值可以减少清除操作的频率,但可能会导致内存使用增加,从而缓解内存瓶颈。默认值为0,最大值为4294967295,最小值为0。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 4294967295 ++ ], ++ "dtype": "Integer", ++ "default_value": 0 ++ }, ++ "innodb_old_blocks_pct": { ++ "desc": "指定InnoDB缓冲池中用于旧块子列表的近似百分比。增大此值可以增加旧块的使用,可能会提高查询性能,并缓解内存瓶颈。默认值为37,最大值为95,最小值为5。", ++ "type": "continuous", ++ "range": [ ++ 5, ++ 95 ++ ], ++ "dtype": "int", ++ "default_value": 37 ++ }, ++ "innodb_old_blocks_time": { ++ "desc": "指定一个块在被插入到旧子列表后,首次访问后必须保持在该子列表中的时间(以毫秒为单位),在此时间到达后,该块可以被移动到新子列表。增大此值可以增加旧块的保留时间,可能会提高查询性能,并缓解内存瓶颈。默认值为1000毫秒。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 1000 ++ }, ++ "innodb_open_files": { ++ "desc": "指定InnoDB可以同时打开的最大文件数。增大此值可以提高并发性能,尤其是在高负载情况下,并且可以缓解内存瓶颈。默认值为300。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 65535 ++ ], ++ "dtype": "int", ++ "default_value": 300 ++ }, ++ "innodb_page_size": { ++ "desc": "指定InnoDB表空间的页面大小,影响存储效率和性能。可以缓解磁盘I/O瓶颈。可选值为4096、8192、16384、32768和65536字节。", ++ "type": "discrete", ++ "range": [ ++ 4096, ++ 8192, ++ 16384, ++ 32768, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": 16384 ++ }, ++ "innodb_parallel_read_threads": { ++ "desc": "定义可以用于并行聚簇索引读取的线程数量。增大此值可以提高读取性能,尤其是在多核系统上,能够缓解CPU瓶颈。默认值为4,最小值为1,最大值为256。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 256 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ }, ++ "innodb_read_ahead_threshold": { ++ "desc": "控制InnoDB使用的线性预读的敏感性,以便将页面预取到缓冲池中。增大此值可以提高顺序读取性能,但可能会增加随机读取的延迟,从而缓解磁盘IO瓶颈。默认值为56,最大值为64,最小值为0。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 56 ++ }, ++ "innodb_read_io_threads": { ++ "desc": "InnoDB读取操作的I/O线程数量。增大此值可以提高读取性能,尤其是在高并发情况下,能够缓解磁盘I/O瓶颈。默认值为4,最大值为64,最小值为1。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ }, ++ "innodb_sync_array_size": { ++ "desc": "定义了互斥锁/锁等待数组的大小。增大此值可以提高高并发工作负载下的线程协调能力,尤其是在等待线程数量较多的情况下。此设置必须在MySQL实例启动时配置,之后无法更改。增大此值可以提高写入性能,尤其是在高负载情况下,能够缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1024 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ }, ++ "innodb_table_locks": { ++ "desc": "如果 `autocommit = 0`,`InnoDB` 会遵循 `LOCK TABLES`;MySQL 不会在 `LOCK TABLES ... 
WRITE` 之前返回,直到所有其他线程释放了对该表的所有锁。启用或禁用InnoDB表锁。启用后可以提高并发性能,缓解CPU瓶颈。取值为ON/OFF,默认值为ON。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ }, ++ "innodb_thread_concurrency": { ++ "desc": "该参数定义了允许进入InnoDB的最大线程数。值为0(默认值)表示无限并发(无上限)。在高并发系统中,此变量用于性能调优。推荐设置为服务器CPU核心数的2倍,以优化并发操作。适当的设置可以提高CPU多核处理能力和并发量,但过高的值可能导致上下文切换增加,从而影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1000 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ }, ++ "innodb_thread_sleep_delay": { ++ "desc": "InnoDB线程休眠延迟,表示InnoDB线程在加入InnoDB队列之前的休眠时间,单位为微秒。增大此值可以减少CPU使用率,但可能会影响响应时间,适用于缓解CPU瓶颈。默认值为10000微秒,最大值为4294967295微秒,最小值为0微秒。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 10000 ++ }, ++ "innodb_write_io_threads": { ++ "desc": "InnoDB写入操作的I/O线程数量。增大此值可以提高写入性能,尤其是在高并发情况下,能够缓解磁盘I/O瓶颈。默认值为4,最大值为64,最小值为1。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ }, ++ "max_heap_table_size": { ++ "desc": "This variable sets the maximum size to which user-created MEMORY tables are permitted to grow. It can reduce disk I/O by keeping temporary tables in memory.", ++ "type": "continuous", ++ "range": [ ++ 16384, ++ 18446744073709551615 ++ ], ++ "dtype": "int", ++ "default_value": 16777216 ++ }, ++ "thread_cache_size": { ++ "desc": "The number of threads cached to handle new connections. This variable can be increased to improve performance if you have a lot of new connections. It reduces CPU overhead from thread creation/destruction. Ideally set high enough so that most new connections use cached threads, but higher values consume more memory.", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 16384 ++ ], ++ "dtype": "int", ++ "default_value": 9 ++ }, ++ "tmp_table_size": { ++ "desc": "Defines the maximum size of internal in-memory temporary tables created by the MEMORY storage engine and, as of MySQL 8.0.28, the TempTable storage engine. If the size of a temporary table exceeds this value, it is converted to a disk-based table, which can alleviate memory and disk I/O. 
Higher values reduce disk usage but increase memory consumption.", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 18446744073709551615 ++ ], ++ "dtype": "int", ++ "default_value": 16777216 ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/nginx.json b/copilot-tune/src/knowledge_base/knob_params/nginx.json +new file mode 100644 +index 0000000..6c186d2 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/nginx.json +@@ -0,0 +1,420 @@ ++{ ++ "access_log": { ++ "desc": "设置访问日志的路径和格式。合理配置可以缓解磁盘IO瓶颈,优化日志记录,提高日志的可读性和分析效率。可以在同一配置级别上指定多个日志。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "off", ++ "logs/access.log combined" ++ ] ++ }, ++ "aio_write": { ++ "desc": "aio_write参数用于指定在启用异步IO(aio)时,是否将其用于写入文件。设置为on时启用异步写入,可以缓解磁盘IO瓶颈;设置为off时禁用异步写入。该参数适用于需要优化文件写入性能的场景。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "client_body_buffer_size": { ++ "desc": "设置读取客户端请求体的缓冲区大小。如果请求体大于缓冲区,整个请求体或其部分将被写入临时文件。增大该值可以处理更大的请求体,但会增加内存使用,可能有助于缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "8k", ++ "16k" ++ ] ++ }, ++ "client_header_buffer_size": { ++ "desc": "设置客户端请求头的缓冲区大小。对于大多数请求,1K字节的缓冲区足够。增大该值可以处理更大的请求头,但会增加内存使用,适用于缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "1k", ++ "8k" ++ ] ++ }, ++ "directio_alignment": { ++ "desc": "设置直接I/O的对齐字节数。该参数用于优化磁盘I/O性能,增大该值可以提高对齐性能,适合大文件的处理,而减小该值则更适合小文件的处理。通过合理设置该参数,可以缓解磁盘I/O瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 512, ++ 4096 ++ ] ++ }, ++ "gzip": { ++ "desc": "启用或禁用响应内容的 gzip 压缩,可以提高网络传输效率,缓解网络瓶颈。取值为 on(启用)或 off(禁用)。启用后,Nginx 会对响应内容进行压缩,减少传输数据量。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "gzip_buffers": { ++ "desc": "设置用于压缩响应的缓冲区数量和大小。增大该值可以提高Gzip压缩性能,但会占用更多内存,适用于缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "16 8k", ++ "8 16k" ++ ] ++ }, ++ "gzip_comp_level": { ++ "desc": "设置响应的 gzip 压缩级别,范围从 1 到 9。增大该值可以提高压缩率,但会增加 CPU 负担,减小则会降低压缩率但提高速度。该参数可以缓解网络瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 9 ++ ] ++ }, ++ "gzip_http_version": { ++ "desc": "设置支持的 Gzip 压缩的 HTTP 版本,通常为 1.0 或 1.1。选择 1.1 可以支持更多的特性,但可能会影响兼容性。该参数可以帮助缓解网络瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "1.0", ++ "1.1" ++ ] ++ }, ++ "gzip_min_length": { ++ "desc": "设置要进行 Gzip 压缩的响应内容的最小长度,单位为字节。该长度由 “Content-Length” 响应头字段决定。增大该值可以避免对小文件进行压缩,从而节省 CPU 资源,并缓解网络瓶颈。建议设置为 20 字节以上,以确保有效的压缩。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 20, ++ 8192 ++ ] ++ }, ++ "gzip_vary": { ++ "desc": "启用或禁用插入“Vary: Accept-Encoding”响应头字段。该设置可以帮助代理服务器根据客户端的压缩支持情况缓存不同的响应,从而缓解网络瓶颈。启用后,响应头中将包含Vary: Accept-Encoding字段,允许更好的缓存管理和内容分发。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "keepalive_disable": { ++ "desc": "用于禁用某些用户代理的 keep-alive 功能,以防止与表现不佳的浏览器建立持久连接。可以缓解内存瓶颈。通过 `browser` 参数指定受影响的浏览器,多个值用空格分隔。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "msie6", ++ "safari", ++ "none" ++ ] ++ }, ++ "keepalive_requests": { ++ "desc": "设置可以通过一个 keep-alive 连接提供服务的最大请求数。该参数可以缓解内存瓶颈,增大该值可以提高连接的复用率,但过高可能导致资源占用过多。适当调整该值以优化性能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 10000 ++ ] ++ }, ++ "keepalive_time": { ++ "desc": "限制在关闭连接之前,keep-alive 连接保持的最大时间。增大该值可以减少连接的关闭频率,从而缓解内存瓶颈,但可能导致资源占用增加。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "1h", ++ "2h" ++ ] ++ }, ++ "keepalive_timeout": { ++ "desc": 
"keepalive_timeout参数设置客户端保持连接的超时时间,超过该时间后,连接将被关闭。该参数可以缓解内存瓶颈,增大该值可以减少频繁的连接建立和关闭。默认值为75秒。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "30s", ++ "60s", ++ "90s", ++ "120s" ++ ] ++ }, ++ "limit_rate_after": { ++ "desc": "设置在传输开始后,达到限制速率的时间,单位为字节。该参数用于控制在响应传输给客户端后,达到限制速率之前的初始传输量。增大该值可以让用户在开始时享受更高的下载速度,之后再限制带宽,从而缓解网络瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1048576 ++ ] ++ }, ++ "lingering_close": { ++ "desc": "控制Nginx关闭客户端连接的方式。该参数可以设置为'on'、'off'或'always'。当设置为'on'时,Nginx会在关闭连接时等待一段时间,以便客户端可以完成数据传输,从而缓解网络瓶颈。设置为'off'时,连接会立即关闭,而设置为'always'则会在每次关闭连接时都进行延迟处理。此参数的合理配置可以提高服务器的性能和用户体验。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "off", ++ "on", ++ "always" ++ ] ++ }, ++ "lingering_time": { ++ "desc": "指定在关闭连接时,nginx 处理来自客户端的额外数据的最大时间。增大该值可以提高连接关闭的稳定性,减小则适合高并发场景,能够缓解网络瓶颈。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "0", ++ "120s" ++ ] ++ }, ++ "lingering_timeout": { ++ "desc": "指定在关闭连接时等待更多客户端数据到达的最大时间。增大该值可以提高连接关闭的稳定性,减小则适合高并发场景。该参数可以缓解网络瓶颈。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "0", ++ "120s" ++ ] ++ }, ++ "open_file_cache": { ++ "desc": "配置一个缓存,用于存储打开的文件描述符、文件大小和修改时间、目录的存在信息以及文件查找错误。通过启用该参数,可以提高Nginx的文件访问性能,减少磁盘I/O瓶颈。该参数的取值可以是OFF(关闭缓存)、ON(开启缓存)或一个具体的缓存大小。开启缓存后,Nginx会在内存中缓存打开的文件句柄,减少对文件系统的访问。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "off", ++ "max=N inactive=time" ++ ] ++ }, ++ "open_file_cache_errors": { ++ "desc": "该参数用于启用或禁用对open_file_cache的文件查找错误的缓存。通过缓存错误,可以减少因文件错误导致的频繁打开文件操作,从而缓解磁盘IO瓶颈。建议在需要频繁访问文件的场景中使用。设置为off时,禁用该功能。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "open_file_cache_min_uses": { ++ "desc": "设置在缓存中保留文件句柄的最小使用次数。该参数定义了在open_file_cache指令的inactive参数配置的时间段内,文件被访问的最小次数。增大该值可以使得不常用的文件句柄被更快地移除,从而释放内存,缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 10 ++ ] ++ }, ++ "open_file_cache_valid": { ++ "desc": "设置缓存文件句柄的有效时间,单位为秒。增大该值可以减少文件句柄的重新打开频率,从而缓解磁盘IO瓶颈,降低系统负担。建议根据实际需求调整该值,以优化性能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 3600 ++ ] ++ }, ++ "output_buffers": { ++ "desc": "设置用于从磁盘读取响应的缓冲区数量和大小。增大该值可以提高并发性能,适合高流量场景;减小则适合低流量场景。该参数可以缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "2 32k" ++ ] ++ }, ++ "postpone_output": { ++ "desc": "该参数设置在发送响应之前可以推迟的字节数。如果可能,客户端数据的传输将被推迟,直到nginx至少有指定字节数的数据可发送。增大该值可以提高网络利用率,适合高带宽场景;减小该值则适合低延迟场景,以缓解网络瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1048576 ++ ] ++ }, ++ "proxy_buffering": { ++ "desc": "启用或禁用从代理服务器的响应缓冲。启用后可以提高性能,但会增加内存使用,适用于缓解内存瓶颈。取值为 ON 时启用,OFF 时禁用。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "proxy_buffers": { ++ "desc": "设置用于读取来自代理服务器响应的缓冲区数量和大小。该参数可以帮助缓解内存瓶颈,增大该值可以处理更大的响应,但会增加内存使用。建议根据实际需求进行调整。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "8 4k", ++ "8 8k" ++ ] ++ }, ++ "proxy_buffer_size": { ++ "desc": "设置用于读取从代理服务器接收到的响应的第一部分的缓冲区大小。增大该值可以处理更大的响应,有助于缓解内存瓶颈,但会增加内存使用。建议根据实际需求进行调整。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "4k", ++ "1024k" ++ ] ++ }, ++ "proxy_busy_buffers_size": { ++ "desc": "当启用从代理服务器缓冲响应时,限制可以忙于向客户端发送响应的缓冲区的总大小。增大该值可以提高性能,但会增加内存使用。建议根据实际情况进行调整,以缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "4k", ++ "1024k" ++ ] ++ }, ++ "proxy_cache_min_uses": { ++ "desc": "设置缓存最小使用次数。该参数用于控制缓存内容的有效性,增大该值可以确保只有被频繁请求的内容才会被缓存,从而缓解磁盘IO瓶颈,减小该值则可能导致不常用内容被缓存。该参数的合理设置可以提高缓存的效率和性能。", ++ "type": "continuous", ++ "dtype": "int", ++ 
"range": [ ++ 1, ++ 10 ++ ] ++ }, ++ "proxy_cache_use_stale": { ++ "desc": "该参数决定在与代理服务器通信时,何种情况下可以使用过期的缓存响应。启用此选项可以缓解磁盘IO瓶颈,减少对后端的请求,从而提高响应速度,但可能返回过期的数据。可选值包括:error、timeout、invalid_header、updating、http_500、http_502、http_503、http_504、http_403、http_404、http_429和off。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "error", ++ "timeout", ++ "invalid_header", ++ "updating", ++ "http_500", ++ "http_502", ++ "http_503", ++ "http_504", ++ "http_403", ++ "http_404", ++ "http_429", ++ "off" ++ ] ++ }, ++ "proxy_connect_timeout": { ++ "desc": "定义与代理服务器建立连接的超时时间,单位为秒。该超时时间可以帮助缓解网络瓶颈,增大该值可以避免因网络延迟导致的连接失败,而减小该值则可以加快失败检测。请注意,该超时时间通常不能超过75秒。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ 1, ++ 75 ++ ] ++ }, ++ "proxy_ignore_client_abort": { ++ "desc": "确定当客户端关闭连接时是否关闭与代理服务器的连接。设置为ON时,代理服务器将忽略客户端的中止请求,确保后端继续处理请求,从而提高后端处理的稳定性,缓解网络瓶颈。设置为OFF时,代理服务器将不忽略客户端的中止请求。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "proxy_max_temp_file_size": { ++ "desc": "设置允许的最大临时文件大小,以处理不适合缓冲区的响应。增大该值可以缓解磁盘IO瓶颈,处理更大的响应,但会增加磁盘使用。建议根据实际需求进行调整。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "0", ++ "1024m" ++ ] ++ }, ++ "proxy_read_timeout": { ++ "desc": "定义从代理服务器读取后端服务器响应的超时时间。该参数可以帮助缓解网络瓶颈,防止长时间未响应的连接占用资源。取值为时间,单位为秒,增大该值可以处理更慢的后端响应,减小则可以更快释放资源。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ "1s", ++ "120s" ++ ] ++ }, ++ "proxy_request_buffering": { ++ "desc": "启用或禁用客户端请求体的缓冲。启用缓冲可以提高性能,减少内存使用,适用于高并发场景。禁用缓冲可能会减少内存使用,但可能会影响性能。取值为ON或OFF,ON表示启用缓冲,OFF表示禁用缓冲。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "proxy_send_timeout": { ++ "desc": "设置代理服务器向客户端发送响应的超时时间。该参数用于控制向被代理服务器发送请求的超时时间,能够有效缓解网络瓶颈,防止长时间未响应的连接占用资源。取值为时间,单位为秒,增大该值可以处理更慢的客户端连接,减小则可以更快释放资源。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "1s", ++ "120s" ++ ] ++ }, ++ "proxy_temp_file_write_size": { ++ "desc": "设置写入临时文件的缓冲区大小。增大该值可以提高性能,尤其是在高负载情况下,有助于缓解磁盘IO瓶颈,但会增加磁盘使用。建议根据实际情况进行调整,以优化性能和资源使用。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "64k", ++ "1024k" ++ ] ++ }, ++ "read_ahead": { ++ "desc": "设置读取前的预读字节数。该参数用于在处理文件时,内核会提前读取指定数量的字节,以缓解磁盘IO瓶颈。增大该值可以提高读取性能,适合大文件;减小则适合小文件。通过合理配置该参数,可以优化系统的文件读取效率。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1048576 ++ ] ++ }, ++ "reset_timedout_connection": { ++ "desc": "该参数用于设置是否重置超时的连接和使用非标准代码444关闭的连接。启用此功能可以缓解内存瓶颈,并更快释放资源。取值为 ON 时启用,OFF 时禁用。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "sendfile": { ++ "desc": "启用或禁用sendfile()系统调用,用于高效地将文件内容发送到网络连接。该功能可以显著提高文件传输的效率,减少磁盘I/O瓶颈。取值为ON时启用,OFF时禁用。默认情况下,该参数为OFF。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "sendfile_max_chunk": { ++ "desc": "设置sendfile()系统调用中每次发送的最大字节数。增大该值可以提高大文件传输的效率,减小则适合小文件。该参数可以缓解磁盘I/O瓶颈,优化文件传输性能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1048576 ++ ] ++ }, ++ "send_timeout": { ++ "desc": "设置发送响应的超时时间。该参数可以帮助缓解网络瓶颈,增大该值可以提高长连接的稳定性,适合需要长时间保持连接的场景;而减小该值则更适合短连接场景,以提高资源的利用率和响应速度。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ "0", ++ "120s" ++ ] ++ }, ++ "tcp_nodelay": { ++ "desc": "启用或禁用TCP_NODELAY选项。该选项在连接进入保持活动状态时启用。此外,在SSL连接、无缓冲代理和WebSocket代理中也会启用此选项。启用此选项可以缓解网络瓶颈,提升性能。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "tcp_nopush": { ++ "desc": 
"该参数用于控制在使用sendfile时是否启用TCP_NOPUSH(在FreeBSD上)或TCP_CORK(在Linux上)选项。这些选项可以在发送数据时将多个小数据包合并为一个大数据包,从而减少网络延迟,缓解网络瓶颈。启用时取值为on,禁用时取值为off。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/pgsql.json b/copilot-tune/src/knowledge_base/knob_params/pgsql.json +new file mode 100644 +index 0000000..62824b9 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/pgsql.json +@@ -0,0 +1,448 @@ ++{ ++ "archive_mode": { ++ "desc": "启用或禁用归档模式。当启用 archive_mode 时,已完成的 WAL 段将发送到归档存储。设置为 on 时,WAL 日志将被归档,适合需要数据恢复的场景。开启归档模式会增加磁盘 I/O,但能提高数据安全性。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "off", ++ "on", ++ "always" ++ ] ++ }, ++ "autovacuum_analyze_scale_factor": { ++ "desc": "该参数控制自动分析(autovacuum)在执行分析操作时,表中需要更新的行数与表的总行数的比例。增大该值可以减少自动分析的频率,降低CPU和IO的负担,但可能导致统计信息不准确。减小该值会增加自动分析的频率,有助于保持查询优化器的统计信息更新,缓解memory瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1 ++ ] ++ }, ++ "autovacuum_max_workers": { ++ "desc": "设置自动清理进程的最大工作线程数。增加此值可以提高数据库的自动清理能力,缓解由于死元组导致的磁盘IO瓶颈。取值范围为0到1024,0表示禁用自动清理。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1024 ++ ] ++ }, ++ "autovacuum_vacuum_scale_factor": { ++ "desc": "该参数控制自动清理(autovacuum)在执行清理操作时,表中需要删除的行数与表的总行数的比例。增大该值可以减少自动清理的频率,从而降低CPU和IO的负担,但可能导致表膨胀。减小该值会增加自动清理的频率,有助于及时释放空间,缓解disk IO瓶颈。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1 ++ ] ++ }, ++ "autovacuum_work_mem": { ++ "desc": "指定每个自动清理工作进程可以使用的最大内存量。增大此值可以提高自动清理的效率,缓解内存瓶颈,尤其是在大表的自动清理时。建议根据表的大小和系统资源进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 64, ++ 1073741823 ++ ] ++ }, ++ "bgwriter_delay": { ++ "desc": "设置后台写入器活动轮次之间的延迟时间。增大此值可以减少磁盘IO的频率,适合IO密集型的工作负载。默认值为200毫秒,单位为毫秒,取值范围为0到1000。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1000 ++ ] ++ }, ++ "bgwriter_flush_after": { ++ "desc": "bgwriter_flush_after参数设置后台写入器在写入数据后,尝试强制操作系统将这些写入操作发送到基础存储的阈值。增大此值可以减少磁盘IO的频率,适合IO密集型的工作负载。单位为字节,默认值为512kB,取值范围为0到1GB。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1073741824 ++ ] ++ }, ++ "bgwriter_lru_maxpages": { ++ "desc": "设置后台写入器每次写入的最大页面数。在每一轮中,后台写入器写入的缓冲区不超过这个数量。增大此值可以提高写入性能,适合内存充足的系统。取值范围为0到10000,0表示禁用此功能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 10000 ++ ] ++ }, ++ "checkpoint_completion_target": { ++ "desc": "设置检查点完成的目标时间比例,作为检查点之间总时间的分数。增大此值可以使检查点过程更加平滑,缓解磁盘IO瓶颈,尤其是在高负载情况下。建议设置为0.5到0.9之间的值。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0.0, ++ 1.0 ++ ] ++ }, ++ "checkpoint_timeout": { ++ "desc": "自动 WAL 检查点之间的最大时间间隔。增大此值可以减少检查点的频率,缓解磁盘 I/O 瓶颈,但可能会增加恢复时间。建议根据系统负载和性能需求进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 30, ++ 86400 ++ ] ++ }, ++ "commit_siblings": { ++ "desc": "在执行 commit_delay 延迟之前,必须有多少个并发打开事务。该参数影响VACUUM操作的触发频率,增大该值可以减少VACUUM的频率,从而降低CPU和磁盘IO的负担。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 100 ++ ] ++ }, ++ "cpu_index_tuple_cost": { ++ "desc": "设置规划器对索引扫描期间处理每个索引条目的成本的估计。增大此值可以影响查询计划的选择,适合CPU密集型的工作负载。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1000 ++ ] ++ }, ++ "cpu_tuple_cost": { ++ "desc": "设置规划器对查询期间处理每一行的成本的估计。增大此值可以影响查询计划的选择,适合CPU密集型的工作负载。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1000 ++ ] ++ }, ++ "deadlock_timeout": { ++ "desc": "这是在检查是否存在死锁条件之前等待锁的时间。增大该值可能会导致死锁检测延迟,但可以减少CPU的消耗。减小该值可以更快地检测到死锁。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 10000 ++ 
] ++ }, ++ "effective_cache_size": { ++ "desc": "设置PostgreSQL查询优化器假设的可用缓存内存量。增大此值可以提高查询优化器的决策质量,缓解内存瓶颈,尤其是在复杂查询中。通常建议设置为系统总内存的50%到75%。", ++ "type": "continuous", ++ "dtype": "integer", ++ "range": [ ++ 16, ++ 1073741823 ++ ] ++ }, ++ "effective_io_concurrency": { ++ "desc": "设置预期的并发磁盘I/O操作数量。增大此值可以提高并发I/O操作的性能,缓解磁盘I/O瓶颈,尤其是在高并发的读写操作中。建议根据硬件能力进行调整,值越高,性能越好,但也要考虑系统负载。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1000 ++ ] ++ }, ++ "enable_bitmapscan": { ++ "desc": "启用或禁用查询规划器对位图扫描计划类型的使用。启用此选项可以提高大数据集的查询性能。取值为ON或OFF。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "off", ++ "on" ++ ] ++ }, ++ "enable_hashjoin": { ++ "desc": "启用或禁用查询规划器对哈希连接计划类型的使用。启用此选项可以提高连接查询的性能,特别适合大数据集的连接操作。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "off", ++ "on" ++ ] ++ }, ++ "enable_indexscan": { ++ "desc": "启用或禁用查询规划器对索引扫描和仅索引扫描计划类型的使用。启用此选项可以提高查询性能,适合有索引的表。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "off", ++ "on" ++ ] ++ }, ++ "enable_material": { ++ "desc": "启用或禁用查询规划器对物化(materialization)的使用。启用此选项可以提高复杂查询的性能,适合需要多次访问相同数据的查询。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "off", ++ "on" ++ ] ++ }, ++ "enable_mergejoin": { ++ "desc": "启用或禁用查询规划器对归并连接计划类型的使用。启用此选项可以提高大数据集连接的性能。取值为ON或OFF。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "enable_nestloop": { ++ "desc": "启用或禁用查询规划器对嵌套循环连接计划的使用。启用此选项可以提高小数据集连接的性能。取值为ON或OFF。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "ON", ++ "OFF" ++ ] ++ }, ++ "enable_partition_pruning": { ++ "desc": "启用或禁用查询规划器从查询计划中消除分区表的分区的功能。启用此选项可以提高分区表的查询性能。取值为ON或OFF。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "ON", ++ "OFF" ++ ] ++ }, ++ "enable_seqscan": { ++ "desc": "启用或禁用查询规划器使用顺序扫描计划类型。启用此选项可以提高查询性能,适合小数据集的查询。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "enable_sort": { ++ "desc": "启用或禁用查询规划器使用显式排序步骤。启用此选项可以提高查询性能,适合需要排序的查询。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "fsync": { ++ "desc": "控制数据库在每次写入后是否强制将数据写入磁盘。设置为on时,确保数据的持久性,但会增加磁盘IO。关闭fsync可能会提高性能,但会增加数据丢失的风险。该参数影响检查点和批量数据写入期间的强制写入操作。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "full_page_writes": { ++ "desc": "当该参数开启时,PostgreSQL在修改每个磁盘页面时会将其完整内容写入WAL(Write-Ahead Logging)。这确保在发生崩溃时可以恢复完整页面,适合需要高数据安全性的场景。开启该选项会增加磁盘I/O。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "idle_in_transaction_session_timeout": { ++ "desc": "设置空闲事务会话的超时时间。增大该值可以减少因空闲会话导致的资源浪费,但可能会导致事务延迟。减小该值可以更快地释放资源。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ null ++ ] ++ }, ++ "jit": { ++ "desc": "该参数控制是否启用JIT(Just-In-Time)编译。设置为ON可以提高复杂查询的性能,缓解CPU瓶颈。设置为OFF则会禁用JIT,适用于简单查询或对性能要求不高的场景。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "lock_timeout": { ++ "desc": "设置获取锁的超时时间。增大该值可以减少因锁竞争导致的错误,但可能会导致事务延迟。减小该值可以更快地释放锁。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 10000 ++ ] ++ }, ++ "log_min_duration_statement": { ++ "desc": "设置记录执行时间超过该值的SQL语句的日志。增大该值可以减少日志记录的数量,适合对性能要求较高的场景。减小该值可以更快地发现性能问题。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 10000 ++ ] ++ }, ++ "maintenance_work_mem": { ++ "desc": "指定维护操作(如VACUUM、CREATE INDEX等)可以使用的最大内存量。增大此值可以加速这些操作,缓解内存瓶颈,尤其是在大表的维护时。建议根据维护任务的复杂度进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 64, ++ 1073741823 ++ ] ++ }, ++ 
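Several of the PostgreSQL knobs in this file leave one bound open ("null" in "range", e.g. idle_in_transaction_session_timeout and statement_timeout), so any consumer has to substitute a finite cap before it can search the space. A rough sketch of how one tuning round might draw a candidate value per knob; the cap, the helper, and the sampling policy are assumptions for illustration only:

import json
import random

ASSUMED_CAP = 2**31 - 1  # illustrative stand-in for open-ended (null) bounds

def sample(knob):
    # Draw one candidate value for a knob definition (illustrative only).
    if knob["type"] == "discrete":
        return random.choice(knob["range"])
    lo, hi = knob["range"]
    hi = ASSUMED_CAP if hi is None else hi
    if knob["dtype"] == "float":
        return round(random.uniform(lo, hi), 3)
    return random.randint(int(lo), int(hi))

with open("copilot-tune/src/knowledge_base/knob_params/pgsql.json", encoding="utf-8") as f:
    candidate = {name: sample(k) for name, k in json.load(f).items()}
print(candidate["shared_buffers"], candidate["wal_level"])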
"max_connections": { ++ "desc": "确定到数据库服务器的最大并发连接数。增大此值可以支持更多的并发用户,缓解内存瓶颈,但也会增加系统资源的消耗。建议根据系统资源和应用需求进行调整。默认值通常为 100 个连接。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 10000 ++ ] ++ }, ++ "max_locks_per_transaction": { ++ "desc": "该参数定义每个事务可以获得的最大锁数量。增大该值可以支持更多的并发事务,适用于高并发的应用场景,缓解内存瓶颈。减小该值可能导致事务因锁不足而失败。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1024 ++ ] ++ }, ++ "max_parallel_workers": { ++ "desc": "设置系统可以使用的最大并行工作进程数。增大此值可以提高并行处理能力,缓解CPU瓶颈,尤其是在高并发查询中。建议根据系统资源进行调整。", ++ "type": "discrete", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1024 ++ ] ++ }, ++ "max_parallel_workers_per_gather": { ++ "desc": "设置每个 Gather 或 Gather Merge 节点可以启动的最大工作进程数。增大此值可以提高查询性能,缓解 CPU 瓶颈,尤其是在复杂查询中。建议根据查询复杂度和系统资源进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1024 ++ ] ++ }, ++ "max_prepared_transactions": { ++ "desc": "该参数设置可以同时处于 '已准备' 状态的最大事务数。增大该值可以支持更多的分布式事务,适用于需要高并发的应用场景,缓解内存瓶颈。减小该值可能导致预备事务因数量不足而失败。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1024 ++ ] ++ }, ++ "max_wal_size": { ++ "desc": "在自动检查点期间允许 WAL 增长的最大大小。增大此值可以减少检查点的频率,缓解磁盘 I/O 瓶颈,尤其是在高写入负载的情况下。这是一个软限制;在特殊情况下,WAL 大小可能会超过此值。建议根据系统的写入负载进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1073741823 ++ ] ++ }, ++ "random_page_cost": { ++ "desc": "用于估算随机读取的真实成本,相对于顺序存取的成本。增大此值可以影响查询优化器的选择,缓解disk IO瓶颈,尤其是在SSD和HDD之间的选择时。通常SSD的值可以设置为1.1,HDD的值可以设置为4.0。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 1.1, ++ 100 ++ ] ++ }, ++ "seq_page_cost": { ++ "desc": "设置规划器对作为一系列顺序获取的一部分的磁盘页面获取成本的估计。较低的值会使查询优化器更倾向于使用顺序扫描,从而可能缓解磁盘IO瓶颈。增大该值会使优化器更倾向于使用索引扫描。", ++ "type": "continuous", ++ "dtype": "float", ++ "range": [ ++ 0, ++ 1000 ++ ] ++ }, ++ "shared_buffers": { ++ "desc": "设置数据库服务器用于共享内存缓冲区的内存量。增加此值可以提高内存使用效率,缓解内存瓶颈,尤其是在高并发的读写操作中。通常建议设置为系统总内存的25%。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 128, ++ 1073741823 ++ ] ++ }, ++ "statement_timeout": { ++ "desc": "设置SQL语句的超时时间,超过该时间将中止执行。增大该值可以减少因长时间运行的查询被中断的风险,但可能会导致资源占用过久。减小该值可以更快地释放资源。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ null ++ ] ++ }, ++ "synchronous_commit": { ++ "desc": "指定在数据库服务器返回成功指示之前,必须完成多少WAL处理。设置为on可以确保数据安全,但会增加延迟;设置为off可以提高性能,但可能会丢失数据。建议根据数据安全性和性能需求进行调整。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "local", ++ "remote_write", ++ "remote_apply", ++ "on", ++ "off" ++ ] ++ }, ++ "temp_buffers": { ++ "desc": "设置每个数据库会话中用于临时缓冲区的最大内存量。增大该值可以提高临时表的性能,适合需要大量临时表的场景。减小该值可能会导致临时表性能下降。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 2147483647 ++ ] ++ }, ++ "temp_file_limit": { ++ "desc": "指定一个进程可以用于临时文件的最大磁盘空间量。增大该值可以允许更大的临时文件,适合需要处理大数据集的场景。减小该值可以防止临时文件占用过多磁盘空间。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1073741823 ++ ] ++ }, ++ "vacuum_cost_limit": { ++ "desc": "这是VACUUM操作的成本限制。增大该值可以允许VACUUM操作使用更多的资源,适合需要频繁清理的场景。减小该值可以减少VACUUM对系统性能的影响。默认值为200。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 100000 ++ ] ++ }, ++ "wal_compression": { ++ "desc": "该参数控制是否对WAL(Write Ahead Log)进行压缩。启用WAL压缩可以减少磁盘IO和存储空间的使用,适用于IO瓶颈的场景。禁用WAL压缩则会提高写入性能,但会增加磁盘空间的使用。", ++ "type": "discrete", ++ "dtype": "bool", ++ "range": [ ++ "on", ++ "off" ++ ] ++ }, ++ "wal_level": { ++ "desc": "确定写入WAL(Write Ahead Log)中的信息量,影响数据恢复和复制的能力。可选值为minimal、replica和logical。增大该值会增加磁盘IO的负担,但提供更强的数据保护和复制能力。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "minimal", ++ "replica", ++ "logical" ++ ] ++ }, ++ "work_mem": { ++ "desc": 
"设置在写入临时磁盘文件之前,查询操作可以使用的内存量。增大此值可以提高复杂查询的性能,缓解内存瓶颈,尤其是在排序和哈希操作中。建议根据查询复杂度和并发量进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 64, ++ 2147483647 ++ ] ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/redis.json b/copilot-tune/src/knowledge_base/knob_params/redis.json +new file mode 100644 +index 0000000..075d303 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/redis.json +@@ -0,0 +1,332 @@ ++{ ++ "active-defrag-cycle-max": { ++ "desc": "最大努力进行碎片整理的CPU百分比,当达到上限阈值时使用。增大该值可以提高碎片整理的效率,但会增加CPU的使用,适用于缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "default_value": 25 ++ }, ++ "active-defrag-cycle-min": { ++ "desc": "该参数用于设置主动碎片整理的最小努力程度,以CPU百分比表示。当达到下限阈值时,将使用此设置。减小该值可以提高碎片整理的频率,但可能会增加CPU的使用,适用于缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1000 ++ ], ++ "default_value": 1 ++ }, ++ "active-defrag-max-scan-fields": { ++ "desc": "在主动碎片整理过程中,最多扫描的字段数量。增大该值可以提高碎片整理的效率,但会增加CPU的使用,适用于缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1000000 ++ ], ++ "default_value": 1000 ++ }, ++ "active-defrag-threshold-lower": { ++ "desc": "主动碎片整理的下限阈值,低于该值时不会进行整理。适当设置可以避免不必要的CPU消耗,适用于内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "default_value": 10 ++ }, ++ "active-defrag-threshold-upper": { ++ "desc": "最大内存碎片率百分比,当内存碎片率高于该值时,Redis将开始进行最大努力的碎片整理,以缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "default_value": 100 ++ }, ++ "activedefrag": { ++ "desc": "启用或禁用主动碎片整理。启用后,Redis会在空闲时自动整理内存,适用于内存碎片严重的场景,可以缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "no" ++ }, ++ "activerehashing": { ++ "desc": "启用主动rehashing后,Redis会在后台自动调整哈希表的大小,以提高性能。该功能每秒进行十次rehash操作,每次操作消耗1毫秒的CPU时间,适用于缓解CPU瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "aof-rewrite-incremental-fsync": { ++ "desc": "该参数决定在AOF重写时是否使用增量fsync。启用增量fsync可以减少写入延迟,适合高写入负载的场景,能够缓解磁盘IO瓶颈。取值为'yes'表示启用,'no'表示不启用。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "dynamic-hz": { ++ "desc": "启用动态事件循环频率调整,Redis会根据连接的客户端数量动态调整事件循环频率,以提高性能,适用于缓解CPU瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "hz": { ++ "desc": "Redis的事件循环频率,单位为赫兹。该参数决定了Redis处理事件的频率,增大该值可以提高事件处理的响应速度,但会增加CPU的使用率,适用于缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 500 ++ ], ++ "default_value": 10 ++ }, ++ "lazyfree-lazy-eviction": { ++ "desc": "控制DEL命令的默认行为,使其像UNLINK一样工作。启用惰性删除后,Redis会在后台异步删除过期键,适用于高并发场景,可以缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "no" ++ }, ++ "lazyfree-lazy-expire": { ++ "desc": "控制过期键的默认行为,使其像UNLINK一样异步处理。启用惰性过期后,Redis将在后台异步处理过期键,适用于高并发场景,可以缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "no" ++ }, ++ "lazyfree-lazy-server-del": { ++ "desc": "控制服务器端删除的行为,使其像UNLINK一样工作。启用后,Redis会在后台异步删除服务器,适用于高并发场景,可以缓解内存瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "no" ++ }, ++ "lfu-decay-time": { ++ "desc": "LFU算法的衰减时间,单位为分钟。增大该值可以延长频率统计的有效期,但会增加内存使用,适用于缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 100000 ++ 
], ++ "default_value": 1 ++ }, ++ "lfu-log-factor": { ++ "desc": "LFU(Least Frequently Used)算法的日志因子,决定了LFU算法的灵敏度。增大该值可以提高LFU算法的准确性,但会增加内存使用。适用于缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "default_value": 10 ++ }, ++ "list-compress-depth": { ++ "desc": "压缩深度是指在使用压缩列表时,从列表两侧排除的快速列表 ziplist 节点的数量。增大该值可以减少内存使用,但可能会影响性能,适用于缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 256 ++ ], ++ "default_value": 0 ++ }, ++ "lua-time-limit": { ++ "desc": "Lua脚本的最大执行时间,单位为毫秒。超出该时间将被强制终止,适用于高并发场景,可以缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 5000 ++ ], ++ "default_value": 5000 ++ }, ++ "maxmemory-samples": { ++ "desc": "在执行内存回收时,Redis会随机抽样的键的数量。该参数表示用于最少使用(LRU)和生存时间(TTL)计算的样本大小。增大该值可以提高内存回收的效率,但会增加CPU的使用,适用于缓解CPU瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ 1000 ++ ], ++ "default_value": 3 ++ }, ++ "maxmemory": { ++ "desc": "设置Redis实例可以使用的最大内存量。通过限制Redis使用的内存,可以防止系统内存耗尽,适用于缓解内存瓶颈。该参数可以设置为具体的字节数或使用单位(如kb, mb, gb)。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 9223372036854771712 ++ ], ++ "default_value": 0 ++ }, ++ "proto-max-bulk-len": { ++ "desc": "该参数定义了Redis协议中最大允许的单个数据包的字节数。它的作用是限制客户端发送给Redis的最大数据量,防止过大的数据包导致内存消耗过高,从而缓解内存瓶颈。增大该值可以允许更大的数据包,但可能会增加内存使用和处理时间;减小该值则会限制数据包大小,可能导致客户端发送失败。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ "1mb", ++ "512mb" ++ ], ++ "default_value": "512mb" ++ }, ++ "rdb-save-incremental-fsync": { ++ "desc": "该参数决定在RDB文件保存时是否使用增量fsync。启用增量fsync可以减少写入延迟,适合高写入负载的场景,并能缓解磁盘IO瓶颈。取值为'yes'表示启用,'no'表示不启用。启用时,Redis会在写入RDB文件时进行增量fsync,减少IO压力。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "rdbchecksum": { ++ "desc": "该参数用于设置RDB文件的校验和算法。启用CRC64校验和可以在加载RDB文件时验证数据的完整性,防止数据损坏,从而提高数据安全性。虽然启用校验和会增加一些CPU开销,但可以缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "crc64", ++ "none" ++ ], ++ "default_value": "crc64" ++ }, ++ "rdbcompression": { ++ "desc": "该参数决定在备份RDB文件时是否使用LZF算法压缩字符串。启用压缩可以减少磁盘IO和内存使用,适合存储大量数据的场景。取值为'yes'表示启用压缩,'no'表示不启用。启用压缩会增加CPU负担,但可以显著减少磁盘空间的使用。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "replica-lazy-flush": { ++ "desc": "控制从节点数据的刷新行为。启用惰性刷新后,Redis会在后台异步刷新从节点数据,适用于高并发场景,可以缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "no" ++ }, ++ "replica-serve-stale-data": { ++ "desc": "该参数决定从节点是否可以提供过期数据。启用后,从节点在主节点不可用时仍可继续提供服务,适用于高可用场景,有助于缓解网络瓶颈。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "set-max-intset-entries": { ++ "desc": "该参数定义了Redis中整数集合(intset)允许的最大元素数量。当集合的元素数量超过该值时,Redis会将其转换为更高效的数据结构。增大该值可以允许更多的整数元素,但会增加内存使用;减小该值则会导致更早地转换数据结构,从而缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 2147483647 ++ ], ++ "default_value": 512 ++ }, ++ "slowlog-log-slower-than": { ++ "desc": "该参数用于设置记录慢查询的阈值,单位为微秒。超过该时间的命令将被记录到慢日志中,便于后续分析。可以缓解CPU瓶颈。增大该值可以减少记录的慢查询数量,减小该值则会记录更多的慢查询,有助于性能调优。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 9223372036854775807 ++ ], ++ "default_value": 10000 ++ }, ++ "slowlog-max-len": { ++ "desc": "该参数定义了慢日志的最大长度,即最多记录的慢查询条目数。增大该值可以记录更多的慢查询,减小该值则会限制记录的数量,可能导致重要的慢查询被覆盖。此参数有助于缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 2147483647 ++ ], ++ "default_value": 128 ++ }, ++ "stop-writes-on-bgsave-error": { ++ "desc": 
"该参数决定在后台保存(bgsave)出现错误时是否停止写入操作。启用该选项可以防止数据丢失,但可能影响可用性。可以缓解磁盘IO瓶颈。取值为'yes'表示启用,'no'表示不启用。启用时,如果bgsave失败,写入操作将被停止,直到问题解决。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "yes", ++ "no" ++ ], ++ "default_value": "yes" ++ }, ++ "stream-node-max-bytes": { ++ "desc": "该参数定义了每个流节点允许的最大字节数。如果设置为0,则节点大小无限制。超过该值的节点将被拆分,增大该值可以允许更大的节点,减小该值则会导致更频繁的节点拆分,可能影响性能。此参数可以帮助缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1073741824 ++ ], ++ "default_value": 4096 ++ }, ++ "stream-node-max-entries": { ++ "desc": "该参数定义了每个流节点允许的最大条目数。超过该值的节点将被拆分,能够缓解内存瓶颈。增大该值可以允许更多的条目,减小该值则会导致更频繁的节点拆分,可能影响性能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 2147483647 ++ ], ++ "default_value": 100 ++ }, ++ "zset-max-ziplist-entries": { ++ "desc": "该参数定义了有序集合(zset)使用ziplist数据结构的最大条目数。超过该值时,zset将转换为更高效的数据结构,从而缓解内存瓶颈。增大该值可以允许更多的条目使用ziplist,减小该值则会导致更早地转换数据结构。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 2147483647 ++ ], ++ "default_value": 128 ++ }, ++ "zset-max-ziplist-value": { ++ "desc": "该参数定义了有序集合(zset)使用ziplist数据结构的最大值大小(字节)。当有序集合中的元素大小小于该值时,将使用一种特殊编码来节省内存。超过该值时,有序集合将转换为更高效的数据结构。增大该值可以允许更大的值使用ziplist,减小该值则会导致更早地转换数据结构,从而缓解内存瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ 1073741824 ++ ], ++ "default_value": 64 ++ } ++} +diff --git a/copilot-tune/src/knowledge_base/knob_params/spark.json b/copilot-tune/src/knowledge_base/knob_params/spark.json +new file mode 100644 +index 0000000..6183b4a +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/spark.json +@@ -0,0 +1,199 @@ ++{ ++ "spark.default.parallelism": { ++ "desc": "RDD 中的默认分区数,由 `join`、`reduceByKey` 和 `parallelize` 等转换返回,当用户未设置时。对于分布式 shuffle 操作(如 reduceByKey 和 join),它取决于父 RDD 中的最大分区数。对于没有父 RDD 的 parallelize 操作,默认值依赖于集群管理器。例如,在本地模式下,默认值为本地机器上的核心数。通常建议将此值设置为集群总核心数的 2 到 4 倍,以提高 CPU 利用率,缓解 CPU 瓶颈。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ null ++ ] ++ }, ++ "spark.driver.memory": { ++ "desc": "用于驱动程序进程的内存量,影响驱动程序的性能和稳定性。增大此值可以缓解内存瓶颈,尤其是在驱动程序需要处理大量数据时。取值通常为如1g、2g等,具体取值应根据集群资源和任务需求进行调整。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": null ++ }, ++ "spark.driver.memoryOverhead": { ++ "desc": "在集群模式下,每个驱动程序进程要分配的非堆内存量,用于存储非JVM内存的开销,如网络和文件系统的缓冲区。适当增加此值可以缓解由于内存不足导致的性能问题,尤其是在处理大数据集时。单位为MiB,默认值为driverMemory * spark.driver.memoryOverheadFactor。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ 384, ++ null ++ ] ++ }, ++ "spark.dynamicAllocation.initialExecutors": { ++ "desc": "如果启用了动态分配,则要运行的初始执行器数量。此参数影响任务启动时的资源分配,增大此值可以提高任务启动速度,缓解CPU瓶颈。取值应根据集群资源和任务需求进行调整,通常为1到数个executor。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ null ++ ] ++ }, ++ "spark.dynamicAllocation.maxExecutors": { ++ "desc": "如果启用了动态分配,则执行器数量的上限。该参数限制了在负载高峰时可用的资源,增大此值可以提高并行度,缓解CPU瓶颈。取值应根据集群资源和任务需求进行调整,通常为数十到数百个executor。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ 0, ++ null ++ ] ++ }, ++ "spark.dynamicAllocation.minExecutors": { ++ "desc": "如果启用了动态分配,则执行器数量的下限。该参数确保在负载较低时仍有一定的资源可用。增大此值可以缓解CPU和内存瓶颈。取值应根据集群资源和任务需求进行调整,通常为1到数个executor。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ null ++ ] ++ }, ++ "spark.executor.cores": { ++ "desc": "每个执行器上要使用的核心数量,影响并行度和任务执行速度。增大此值可以缓解CPU瓶颈,但需确保集群资源足够。通常取值为1到数个核心,具体取值应根据任务并行需求和集群资源进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ null ++ ] ++ }, ++ "spark.executor.memory": { ++ "desc": "每个执行器进程使用的内存量,通常以字节为单位(如1g、2g等)进行指定。增大此值可以缓解内存瓶颈,尤其是在处理大数据集时。具体取值应根据集群资源和任务需求进行调整。", ++ "type": "continuous", ++ "dtype": "string", 
++ "range": [ ++ null, ++ null ++ ] ++ }, ++ "spark.executor.memoryOverhead": { ++ "desc": "每个执行器进程要分配的额外内存量,以 MiB 为单位,主要用于存储非JVM内存的开销,包括 VM 开销、内部字符串和其他本机开销等。通常建议的取值为执行器内存的 6-10%。增大此值可以缓解内存瓶颈,尤其是在使用外部库时。具体取值应根据任务需求进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 384, ++ null ++ ] ++ }, ++ "spark.io.compression.codec": { ++ "desc": "用于压缩内部数据的编解码器,例如 RDD 分区、事件日志、广播变量和混洗输出。支持的编码格式包括 LZ4、LZF、Snappy 和 ZSTD。使用压缩可以减少磁盘 IO 和网络带宽的消耗,提升数据传输效率。不同的编码格式在压缩比和压缩速度上有所不同,例如 Snappy 压缩速度快但压缩比相对较低,而 Gzip 压缩比高但速度较慢。默认情况下,Spark 提供四种编解码器:lz4、lzf、snappy 和 zstd。", ++ "type": "discrete", ++ "dtype": "string", ++ "range": [ ++ "lz4", ++ "lzf", ++ "snappy", ++ "zstd" ++ ] ++ }, ++ "spark.kryo.referenceTracking": { ++ "desc": "启用或禁用Kryo序列化的引用跟踪。启用引用跟踪可以减少内存使用和序列化时间,特别是在处理大量重复对象时。这在对象图具有循环时是必需的,并且如果它们包含同一对象的多个副本,则对效率很有用。如果知道情况并非如此,可以禁用它以提高性能。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "true", ++ "false" ++ ] ++ }, ++ "spark.locality.wait": { ++ "desc": "在调度任务时,Spark会等待数据本地性(locality)的时间,以提高任务的执行效率。该参数定义了在放弃并将任务调度到不太本地化的节点之前,Spark等待的时间。等待时间会逐步遍历多个本地化级别(进程本地、节点本地、机架本地等)。如果任务执行时间较长且本地化效果不佳,可以考虑增加此设置,但默认值通常能够满足大多数场景的需求。适当调整此参数可以在CPU和网络瓶颈之间取得平衡。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ null, ++ null ++ ] ++ }, ++ "spark.memory.fraction": { ++ "desc": "spark.memory.fraction参数用于指定执行和存储所使用的内存比例,计算方式为(堆空间 - 300MB)的部分。此值越低,溢出和缓存数据驱逐的频率越高。增大此值可以提高内存利用率,缓解内存瓶颈。建议根据具体任务特性进行调整,默认值为0.6。", ++ "type": "continuous", ++ "dtype": "double", ++ "range": [ ++ 0.0, ++ 1.0 ++ ] ++ }, ++ "spark.memory.offHeap.enabled": { ++ "desc": "启用off-heap内存管理,允许Spark使用JVM外的内存。启用后可以缓解内存瓶颈,尤其是在处理大数据集时。若启用堆外内存使用,则必须设置spark.memory.offHeap.size为正值。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "true", ++ "false" ++ ] ++ }, ++ "spark.memory.offHeap.size": { ++ "desc": "用于堆外分配的内存绝对量,以字节为单位。此设置不会影响堆内存使用情况,因此,如果您的执行器总内存消耗必须符合某个硬性限制,请务必相应地缩小 JVM 堆大小。当 spark.memory.offHeap.enabled=true 时,此值必须设置为正值。增大此值可以提高内存利用率,缓解内存瓶颈。取值通常为如512m、1g等,具体取值应根据集群资源和任务需求进行调整。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 0, ++ null ++ ] ++ }, ++ "spark.reducer.maxSizeInFlight": { ++ "desc": "每个 reduce 任务同时获取的映射输出的最大大小(以 MiB 为单位)。此参数影响 shuffle 性能,增大此值可以提高网络利用率,缓解网络瓶颈。建议根据网络带宽和任务特性进行调整,通常取值为 48m、64m 等。为了避免每个 reduce 任务的固定内存开销,建议将此值设置得较小。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ 1, ++ null ++ ] ++ }, ++ "spark.shuffle.compress": { ++ "desc": "启用shuffle数据压缩以减少网络传输和磁盘IO的开销。通常建议启用此功能,以缓解disk IO和network瓶颈。启用时需注意CPU的额外开销。取值为true或false。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "true", ++ "false" ++ ] ++ }, ++ "spark.shuffle.file.buffer": { ++ "desc": "每个混洗文件输出流的内存缓冲区的大小(以 KiB 为单位)。这些缓冲区可以减少在创建中间混洗文件时的磁盘寻址和系统调用次数,从而提高性能。增大此值可以提高磁盘IO性能,缓解disk IO瓶颈。取值通常为如32k、64k等,具体取值应根据任务特性进行调整。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ 1, ++ 1048576 ++ ] ++ }, ++ "spark.speculation": { ++ "desc": "启用或禁用任务推测执行。推测执行可以在某些任务运行缓慢时启动额外的副本,以减少整体作业的执行时间。此参数可以缓解由于某些任务的CPU或内存瓶颈导致的性能问题。取值为true时启用推测执行,false时禁用。", ++ "type": "discrete", ++ "dtype": "boolean", ++ "range": [ ++ "true", ++ "false" ++ ] ++ }, ++ "spark.sql.adaptive.maxNumPostShufflePartitions": { ++ "desc": "设置Spark SQL自适应查询执行中,Shuffle后最大分区数。此参数可以帮助优化Shuffle后的数据分布,适当调整可以缓解内存和CPU瓶颈。增大此值可以提高并行度,减小此值可以减少分区数量。建议根据具体的作业需求进行调整,以达到最佳性能。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ null ++ ] ++ }, ++ "spark.sql.files.maxPartitionBytes": { ++ "desc": "设置Spark 
SQL读取文件时每个分区的最大字节数。此配置仅在使用基于文件的源(如Parquet、JSON和ORC)时有效。增大此值可以减少分区数量,但可能导致内存瓶颈;减小此值可以增加分区数量,提高并行度,从而影响任务的并行性和内存使用。该参数的默认值为128MB。", ++ "type": "continuous", ++ "dtype": "string", ++ "range": [ ++ 1, ++ null ++ ] ++ }, ++ "spark.task.maxFailures": { ++ "desc": "在放弃作业之前,任何特定任务连续失败的次数。此参数可以帮助在任务失败时进行重试,从而提高作业的成功率。适当增加此值可以缓解由于临时故障导致的任务失败,但过高的值可能会导致资源浪费。", ++ "type": "continuous", ++ "dtype": "int", ++ "range": [ ++ 1, ++ null ++ ] ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/knob_params/system.json b/copilot-tune/src/knowledge_base/knob_params/system.json +new file mode 100644 +index 0000000..93438ea +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/knob_params/system.json +@@ -0,0 +1,1395 @@ ++{ ++ "kernel.sched_cfs_bandwidth_slice_us": { ++ "desc": "1. 当需要更精细控制CFS带宽分配时(如高负载容器环境),可适当减小该值(默认5000微秒),但需注意过小会增加调度开销\n2. 在CPU资源充足且需要减少调度开销的场景下,可增大该值以减少全局时间池的分配频率", ++ "type": "continuous", ++ "range": [ ++ 1000, ++ 50000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.sched_cfs_bandwidth_slice_us=$param_value", ++ "get": "sysctl -n kernel.sched_cfs_bandwidth_slice_us" ++ }, ++ "kernel.sched_latency_ns": { ++ "desc": "1. 当系统运行高优先级实时任务时,若出现调度延迟过高的情况,可适当减小该值以提高调度响应速度\n\n2. 对于CPU密集型负载且任务数量较多(超过8个逻辑CPU)的系统,应增大该值以减少上下文切换开销", ++ "type": "continuous", ++ "range": [ ++ 1000000, ++ 100000000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.sched_latency_ns=$param_value", ++ "get": "sysctl -n kernel.sched_latency_ns" ++ }, ++ "kernel.sched_min_granularity_ns": { ++ "desc": "1. 当系统负载较高且存在大量短时间运行的进程时,可以适当增大该值以减少上下文切换开销,建议从默认值(通常为1,000,000 ns)逐步增加测试,观察性能变化\n\n2. 对于CPU密集型工作负载且进程运行时间普遍较长的情况,可以适当减小该值以提高系统响应能力,建议从默认值逐步减少测试", ++ "type": "continuous", ++ "range": [ ++ 1000000, ++ 100000000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.sched_min_granularity_ns=$param_value", ++ "get": "sysctl -n kernel.sched_min_granularity_ns" ++ }, ++ "transparent_hugepage.defrag": { ++ "desc": "1. 对于需要低延迟的应用(如数据库、实时系统),建议禁用该参数(设置为never),以避免因内存碎片整理导致的性能波动\n2. 对于内存密集型且对延迟不敏感的应用(如批处理作业),建议启用该参数(设置为always或defer+madvise),以提高大内存页使用率减少TLB缺失", ++ "range": [ ++ "always", ++ "defer", ++ "defer+madvise", ++ "madvise", ++ "never" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/kernel/mm/transparent_hugepage/defrag", ++ "get": "cat /sys/kernel/mm/transparent_hugepage/defrag | sed -n 's/.*\\[\\(.*\\)\\].*/\\1/p'" ++ }, ++ "transparent_hugepage.enabled": { ++ "desc": "1. 对于延迟敏感型应用(如数据库、实时系统),建议禁用(设置为never或madvise),以避免因透明大页碎片整理导致的不可预测延迟\n\n2. 对于内存密集型批处理作业(如科学计算、大数据处理),建议启用(设置为always),以通过减少页表项提升内存访问效率", ++ "range": [ ++ "always", ++ "madvise", ++ "never" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/kernel/mm/transparent_hugepage/enabled", ++ "get": "cat /sys/kernel/mm/transparent_hugepage/enabled | sed -n 's/.*\\[\\(.*\\)\\].*/\\1/p'" ++ }, ++ "net.netfilter.nf_conntrack_max": { ++ "desc": "1. 当服务器处理大量并发连接(如超过默认值65536)时,若出现\"table full\"相关内核日志或连接跟踪表频繁满导致丢包,应增加该值(通常设置为总内存MB数/16384,如8GB内存可设为524288)\n\n2. 
在高并发短连接场景下,若nf_conntrack_count经常接近nf_conntrack_max值,应结合连接跟踪超时时间(nf_conntrack_tcp_timeout_*系列参数)一同调整,避免过早占满跟踪表", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 67108864 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.netfilter.nf_conntrack_max=$param_value", ++ "get": "sysctl -n net.netfilter.nf_conntrack_max" ++ }, ++ "kernel.pid_max": { ++ "desc": "- 当系统频繁达到当前pid_max限制导致无法创建新进程时,应适当增大该值,通常可设置为默认值(32768)的2-4倍\n- 在容器化环境中若需支持大量短生命周期进程,建议将pid_max提升至262144(2^18)以匹配现代Linux内核支持的上限", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ 4194304 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.pid_max=$param_value", ++ "get": "sysctl -n kernel.pid_max" ++ }, ++ "kernel.shmmni": { ++ "desc": "- 当运行需要大量共享内存段的数据库(如Oracle)或科学计算应用时,若出现\"SHMMNI\"相关错误日志,应增加该值至超过应用实际需求的20%冗余量\n\n- 在容器化或虚拟化环境中,若单个物理节点需承载多个共享内存密集型实例,应按实例数乘以单个实例需求量的1.5倍进行设置", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 16384 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.shmmni=$param_value", ++ "get": "sysctl -n kernel.shmmni" ++ }, ++ "kernel.shmmax": { ++ "desc": "1. 当运行需要大量共享内存的应用(如Oracle数据库、SAP HANA等)时,如果应用报错提示共享内存不足,需要将kernel.shmmax设置为至少等于所有共享内存段总和的80%-90%,但不超过物理内存的90%\n\n2. 在容器化或虚拟化环境中,当多个实例需要共享内存通信且出现性能瓶颈时,应根据每个实例的实际共享内存需求总和来调整kernel.shmmax,确保其值大于所有实例需求之和", ++ "type": "continuous", ++ "range": [ ++ 17179869184, ++ 68719476736 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.shmmax=$param_value", ++ "get": "sysctl -n kernel.shmmax" ++ }, ++ "kernel.shmall": { ++ "desc": "1. 当系统运行需要大量共享内存的应用(如Oracle数据库)且出现\"SHMMAX too small\"错误时,需要增加该值至物理内存的80%左右\n\n2. 当系统频繁使用共享内存但未充分利用物理内存时,可适当降低该值以避免资源浪费,通常设置为(总物理内存 - 系统保留内存) / PAGE_SIZE", ++ "type": "continuous", ++ "range": [ ++ 1073741824, ++ 8589934592 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.shmall=$param_value", ++ "get": "sysctl -n kernel.shmall" ++ }, ++ "kernel.core_uses_pid": { ++ "desc": "1. 当需要快速定位崩溃进程时,建议启用该参数(设为1),通过core文件名中的PID可以快速关联到具体进程信息\n\n2. 当系统频繁产生core文件且磁盘空间紧张时,建议禁用该参数(设为0),避免文件名过长导致管理困难", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.core_uses_pid=$param_value", ++ "get": "sysctl -n kernel.core_uses_pid" ++ }, ++ "kernel.msgmni": { ++ "desc": "1. 当系统日志频繁出现\"msgmni limit reached\"错误时,表明当前队列数量不足以支撑应用需求,需要增加该值\n\n2. 对于频繁使用System V消息队列的中间件应用(如Oracle数据库),建议将该值设置为进程数量的4倍以上", ++ "type": "continuous", ++ "range": [ ++ 8000, ++ 128000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.msgmni=$param_value", ++ "get": "sysctl -n kernel.msgmni" ++ }, ++ "kernel.msgmax": { ++ "desc": "1. 当应用频繁发送超过当前 kernel.msgmax 限制的大消息导致消息队列操作失败时,应适当增大该值,但需确保不超过系统可用内存的合理比例\n\n2. 若系统存在大量小消息传输且 msgmax 设置过大导致内存碎片化,应降低该值以匹配实际消息大小,通常不低于 8KB", ++ "type": "continuous", ++ "range": [ ++ 4096, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.msgmax=$param_value", ++ "get": "sysctl -n kernel.msgmax" ++ }, ++ "kernel.msgmnb": { ++ "desc": "增加该值当消息队列频繁达到默认上限(通常为16384字节)导致应用报错时 \n降低该值当系统存在大量闲置消息队列且需要减少内核内存占用时", ++ "type": "continuous", ++ "range": [ ++ 4096, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.msgmnb=$param_value", ++ "get": "sysctl -n kernel.msgmnb" ++ }, ++ "kernel.hung_task_timeout_secs": { ++ "desc": "1. 当系统频繁出现hung_task警告但实际任务仍在正常执行时,可适当增大该值(如从默认120秒调整为300秒),避免误报\n\n2. 
对于存储密集型应用(如数据库服务器),若观察到存储设备响应较慢导致任务频繁超时,应结合存储延迟指标调高该值至存储设备平均响应时间的2-3倍", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 1200 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.hung_task_timeout_secs=$param_value", ++ "get": "sysctl -n kernel.hung_task_timeout_secs" ++ }, ++ "kernel.nmi_watchdog": { ++ "desc": "1. 在生产服务器上建议禁用该参数(设置为0),因为NMI watchdog会周期性触发NMI中断,可能对系统性能产生轻微影响,尤其在高负载场景下\n\n2. 在调试内核死锁或硬件问题时可以临时启用(设置为1),帮助捕获长时间关中断导致的挂起问题", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.nmi_watchdog=$param_value", ++ "get": "sysctl -n kernel.nmi_watchdog" ++ }, ++ "kernel.sched_rt_runtime_us": { ++ "desc": "1. 当系统需要运行更多实时任务时,可以适当增加该值(但不超过sched_rt_period_us的95%),默认值950000微秒可提高到990000微秒\n\n2. 当非实时任务出现严重饥饿现象时,应减小该值(建议不低于800000微秒),为普通任务保留更多CPU时间", ++ "type": "continuous", ++ "range": [ ++ 950000, ++ 1000000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.sched_rt_runtime_us=$param_value", ++ "get": "sysctl -n kernel.sched_rt_runtime_us" ++ }, ++ "kernel.timer_migration": { ++ "desc": "1. 在NUMA架构服务器上运行低延迟应用时,若出现跨节点时钟中断导致的性能抖动,应禁用该参数以保持本地CPU处理时钟中断\n\n2. 当系统负载主要集中运行在单个NUMA节点且出现时钟中断处理不均衡时,可启用该参数允许时钟中断在CPU间迁移", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.timer_migration=$param_value", ++ "get": "sysctl -n kernel.timer_migration" ++ }, ++ "kernel.threads-max": { ++ "desc": "1. 当系统频繁出现\"fork: Cannot allocate memory\"错误或应用程序因无法创建新线程而崩溃时,需要增加该值。可通过计算系统内存容量和单个线程平均内存占用来确定合理上限,通常设置为物理内存(MB)/8。\n\n2. 在高并发容器环境或运行大量轻量级线程的应用(如Java微服务)时,若/proc/sys/kernel/pid_max已调高但仍有线程创建限制,应将该值提升至至少pid_max值的2倍。", ++ "type": "continuous", ++ "range": [ ++ 655360, ++ 65536000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.threads-max=$param_value", ++ "get": "sysctl -n kernel.threads-max" ++ }, ++ "kernel.sysrq": { ++ "desc": "1. 生产环境中建议设置为1(仅启用基本功能)或0(完全禁用),避免通过SysRq组合键意外触发系统操作,降低安全风险\n\n2. 调试崩溃或死机问题时临时设置为1或更大值(如176/128),启用更多调试功能后需立即恢复默认安全配置", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.sysrq=$param_value", ++ "get": "sysctl -n kernel.sysrq" ++ }, ++ "kernel.sched_autogroup_enabled": { ++ "desc": "1. 在服务器环境下建议禁用该参数(设为0),因为自动任务分组主要针对桌面交互程序优化,服务器工作负载通常不需要这种调度特性\n\n2. 当服务器运行大量短时交互式任务且出现调度延迟问题时,可尝试启用(设为1)观察效果,但需注意可能影响批处理任务的吞吐量", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.sched_autogroup_enabled=$param_value", ++ "get": "sysctl -n kernel.sched_autogroup_enabled" ++ }, ++ "kernel.numa_balancing": { ++ "desc": "1. 当系统运行NUMA架构且应用存在跨节点内存访问时,应启用该参数(设置为1)以减少远程内存访问延迟\n2. 对于内存密集型且对延迟敏感的应用,若观察到较高比例的跨节点内存访问,建议禁用该参数(设置为0)以避免自动平衡带来的性能波动", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.numa_balancing=$param_value", ++ "get": "sysctl -n kernel.numa_balancing" ++ }, ++ "kernel.randomize_va_space": { ++ "desc": "1. 当系统运行安全性要求较高的服务时,建议保持默认值2(完全随机化),以增强对抗内存攻击的能力\n\n2. 若应用程序出现因地址随机化导致的兼容性问题,且运行环境可信,可临时调整为1(仅对数据段随机化)或0(禁用)进行测试", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w kernel.randomize_va_space=$param_value", ++ "get": "sysctl -n kernel.randomize_va_space" ++ }, ++ "kernel.dmesg_restrict": { ++ "desc": "1. 如果系统需要满足安全合规要求(如PCI-DSS、HIPAA等),建议设置为1以限制普通用户查看内核日志,防止敏感信息泄露\n2. 
在需要开发调试或故障排查的环境中,建议设置为0以便非特权用户也能查看完整的系统日志信息", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w kernel.dmesg_restrict=$param_value", ++ "get": "sysctl -n kernel.dmesg_restrict" ++ }, ++ "vm.swappiness": { ++ "desc": "1. 对于内存密集型应用(如数据库、缓存服务),建议将 vm.swappiness 设置为 10-30 以减少交换空间使用,优先利用物理内存\n2. 当系统频繁发生 OOM (Out of Memory) 时,可适当提高 vm.swappiness 至 60-80 以增加交换空间使用,避免进程被强制终止", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.swappiness=$param_value", ++ "get": "sysctl -n vm.swappiness" ++ }, ++ "vm.vfs_cache_pressure": { ++ "desc": "1. 当系统频繁进行目录和inode缓存回收导致性能下降时,可适当降低该值(如设为50-100),减少内核回收缓存内存的频率\n\n2. 当系统内存充足但缓存利用率不足时,可适当提高该值(如设为150-200),促使内核更积极地回收缓存内存", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 500 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.vfs_cache_pressure=$param_value", ++ "get": "sysctl -n vm.vfs_cache_pressure" ++ }, ++ "vm.dirty_background_ratio": { ++ "desc": "1. 对于写入密集型应用(如数据库服务器),建议将值从默认的10%提高到15-20%,以减少频繁的后台刷写对I/O性能的影响\n\n2. 对于内存较小的系统(如低于8GB),建议保持默认值或降至5-10%,以避免过多脏页堆积导致内存压力", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.dirty_background_ratio=$param_value", ++ "get": "sysctl -n vm.dirty_background_ratio" ++ }, ++ "vm.dirty_ratio": { ++ "desc": "1. 当系统频繁因脏页刷盘导致I/O瓶颈时,可适当降低该值(如从默认20%降至10%),以减少单次刷盘的数据量,但会增加刷盘频率\n\n2. 若系统内存较大且主要处理顺序写入负载,可适当提高该值(如升至30%-40%),利用内存缓冲更多脏数据,减少磁盘I/O次数", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.dirty_ratio=$param_value", ++ "get": "sysctl -n vm.dirty_ratio" ++ }, ++ "vm.stat_interval": { ++ "desc": "1. 当系统需要更频繁监控内存使用情况(如内存压力大或频繁交换时),可适当减小该值(如从默认10秒降至5秒),但需注意增加的系统开销 \n\n2. 在内存使用稳定且低负载环境中,可增大该值(如调至30秒)以减少/proc/vmstat的更新频率,降低内核开销", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.stat_interval=$param_value", ++ "get": "sysctl -n vm.stat_interval" ++ }, ++ "vm.dirty_expire_centisecs": { ++ "desc": "1. 对于需要快速持久化数据的应用(如数据库),建议将值调低至100-300(1-3秒),以减少数据丢失风险 \n2. 对于写入密集型负载且对延迟敏感的应用,可适当提高至1000-3000(10-30秒),通过合并更多写操作来提升I/O吞吐量", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.dirty_expire_centisecs=$param_value", ++ "get": "sysctl -n vm.dirty_expire_centisecs" ++ }, ++ "vm.dirty_writeback_centisecs": { ++ "desc": "1. 当系统频繁出现I/O等待或磁盘写入延迟较高时,可适当降低该值(如从默认500调整为200-300),以加快脏页回写频率,减少突发写入导致的性能波动\n\n2. 对于写入密集型负载且使用电池供电的设备(如服务器UPS环境),可适当提高该值(如设置为1000-1500),通过减少磁盘写入次数来降低I/O开销和能耗", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.dirty_writeback_centisecs=$param_value", ++ "get": "sysctl -n vm.dirty_writeback_centisecs" ++ }, ++ "vm.overcommit_ratio": { ++ "desc": "1. 当物理服务器内存使用率长期低于50%且需要运行大量内存申请不确定的应用程序时,可适当提高该比例(如设置为80-90%)以提升内存利用率\n\n2. 在内存密集型应用场景下,若频繁触发OOM killer且监控显示实际内存使用接近物理内存总量,应降低该比例(如设置为50-70%)以避免过度承诺内存", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.overcommit_ratio=$param_value", ++ "get": "sysctl -n vm.overcommit_ratio" ++ }, ++ "vm.overcommit_memory": { ++ "desc": "1. 当系统运行内存密集型应用且频繁触发OOM killer时,建议将值设为0(保守策略)或2(严格策略)以避免过度分配\n\n2. 当系统主要运行已知内存需求的批量任务且需要最大化内存利用率时,可设为1(总是允许过度分配)以提升吞吐量", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w vm.overcommit_memory=$param_value", ++ "get": "sysctl -n vm.overcommit_memory" ++ }, ++ "vm.min_free_kbytes": { ++ "desc": "1. 
当系统频繁触发直接内存回收(direct reclaim)导致性能下降时,需要增加该值以减少直接回收频率,建议设置为物理内存的1-3%\n\n2. 当系统存在大量不可移动页(unmovable pages)导致内存碎片化严重时,需适当提高该值以预留更多连续内存空间", ++ "type": "continuous", ++ "range": [ ++ 10240, ++ 1024000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.min_free_kbytes=$param_value", ++ "get": "sysctl -n vm.min_free_kbytes" ++ }, ++ "vm.page-cluster": { ++ "desc": "1. 当系统频繁进行大块连续内存交换时,可适当增大该值(默认3,建议范围3-10),减少交换操作的I/O开销\n\n2. 在SSD存储的交换分区环境中,由于随机访问性能较好,可降低该值(建议1-3)以减少单次交换延迟", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 8 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.page-cluster=$param_value", ++ "get": "sysctl -n vm.page-cluster" ++ }, ++ "vm.max_map_count": { ++ "desc": "增加该值当运行内存密集型应用(如Elasticsearch或数据库)时出现\"max virtual memory areas vm.max_map_count [65530] is too low\"错误\n\n将该值设置为262144或更高当运行需要大量内存映射的Java应用(如Hadoop或Spark)时", ++ "type": "continuous", ++ "range": [ ++ 100000, ++ 10000000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.max_map_count=$param_value", ++ "get": "sysctl -n vm.max_map_count" ++ }, ++ "vm.zone_reclaim_mode": { ++ "desc": "1. 当系统运行在NUMA架构且存在跨节点内存访问延迟问题时,建议将vm.zone_reclaim_mode设置为1,优先尝试在本地节点回收内存以减少远程访问延迟\n\n2. 当系统内存压力较大且本地节点回收效果不佳时,建议将vm.zone_reclaim_mode设置为0,允许从其他节点回收内存以提高整体回收效率", ++ "range": [ ++ "0", ++ "1", ++ "2", ++ "4" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w vm.zone_reclaim_mode=$param_value", ++ "get": "sysctl -n vm.zone_reclaim_mode" ++ }, ++ "vm.watermark_scale_factor": { ++ "desc": "1. 当系统频繁触发直接内存回收(direct reclaim)且kswapd进程活跃度不足时,可适当降低该值(如从默认的10调整至5-8),使kswapd更早介入内存回收\n\n2. 在内存压力较大且kswapd持续运行的场景下,若观察到系统响应延迟增加,可尝试增大该值(如调整至15-20),延迟kswapd休眠时机以提升回收效率", ++ "type": "continuous", ++ "range": [ ++ 10, ++ 1000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w vm.watermark_scale_factor=$param_value", ++ "get": "sysctl -n vm.watermark_scale_factor" ++ }, ++ "vm.numa_stat": { ++ "desc": "1. 当系统内存资源紧张且NUMA统计对当前业务场景不重要时,可将该参数设为0以降低统计精度,减少内存开销\n2. 在需要精确监控NUMA内存行为的高性能计算场景中,应保持该参数启用(默认值1)以获得完整统计信息", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w vm.numa_stat=$param_value", ++ "get": "sysctl -n vm.numa_stat" ++ }, ++ "vm.drop_caches": { ++ "desc": "1. 当系统内存压力较大且缓存占用过高时,可临时设置为3释放pagecache、dentries和inodes缓存,但不宜频繁操作以免影响性能\n\n2. 在运行内存密集型应用前,可设置为1仅释放pagecache,避免缓存干扰应用性能测试结果", ++ "range": [ ++ "1", ++ "2", ++ "3" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w vm.drop_caches=$param_value", ++ "get": "sysctl -n vm.drop_caches" ++ }, ++ "fs.inotify.max_user_watches": { ++ "desc": "1. 当监控大量文件或目录时(如日志目录、代码仓库等),若出现\"Too many open files\"或\"User limit of inotify watches reached\"错误,需增加该值\n\n2. 对于高并发文件监控场景(如实时日志分析、文件同步服务),建议将该值调整为默认值(通常8192)的4-8倍,具体数值应根据实际监控文件数量确定", ++ "type": "continuous", ++ "range": [ ++ 4096, ++ 819200 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w fs.inotify.max_user_watches=$param_value", ++ "get": "sysctl -n fs.inotify.max_user_watches" ++ }, ++ "fs.nr_open": { ++ "desc": "1. 当应用(如数据库、Web服务器)频繁报告\"too many open files\"错误且ulimit -n已调高时,需增加该值至大于等于进程实际需要的最大文件描述符数\n\n2. 在内存资源紧张的系统中,若该值设置过高(如接近memlock限制),应适当降低以防止内存耗尽", ++ "type": "continuous", ++ "range": [ ++ 10240, ++ 1024000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w fs.nr_open=$param_value", ++ "get": "sysctl -n fs.nr_open" ++ }, ++ "fs.file-max": { ++ "desc": "1. 当系统频繁出现\"Too many open files\"错误或监控显示文件句柄使用率持续接近当前限制时,需要增加该值\n\n2. 
对于高并发服务(如Web服务器、数据库等),建议将该值设置为物理内存大小(KB)的10%-20%(例如64GB内存可设置为6,400,000-12,800,000)", ++ "type": "continuous", ++ "range": [ ++ 102400, ++ 10240000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w fs.file-max=$param_value", ++ "get": "sysctl -n fs.file-max" ++ }, ++ "fs.aio-max-nr": { ++ "desc": "- 当系统日志频繁出现\"aio-max-nr reached\"警告或应用程序因异步I/O请求被拒绝而报错时,需要增加该值\n- 对于高并发数据库服务器(如MySQL/PostgreSQL)或大规模文件处理应用,建议将该值设置为(并发线程数×每个线程可能持有的未完成AIO请求数)×2", ++ "type": "continuous", ++ "range": [ ++ 102400, ++ 10240000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w fs.aio-max-nr=$param_value", ++ "get": "sysctl -n fs.aio-max-nr" ++ }, ++ "fs.inotify.max_user_instances": { ++ "desc": "1. 当系统日志频繁出现\"inotify instance limit reached\"或类似错误时,表明当前用户运行的监控进程(如文件同步工具、开发热加载工具等)数量超过限制,需要增加该值\n\n2. 对于运行大量容器或微服务的环境,每个容器实例可能需要独立的inotify实例监控文件变化,此时应根据实际容器数量合理调高该参数", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 65535 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w fs.inotify.max_user_instances=$param_value", ++ "get": "sysctl -n fs.inotify.max_user_instances" ++ }, ++ "fs.suid_dumpable": { ++ "desc": "1. 当系统需要调试setuid程序崩溃问题时,建议将值设为1(debug模式),允许生成核心转储文件用于故障分析\n\n2. 在注重安全性的生产环境中,建议保持默认值0,避免潜在的安全风险,防止敏感信息通过核心转储泄露", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w fs.suid_dumpable=$param_value", ++ "get": "sysctl -n fs.suid_dumpable" ++ }, ++ "blockdev": { ++ "desc": "增大预读值(如设置为8192)可提升顺序读性能,适用于频繁大文件顺序读场景\n\n降低预读值(如设置为128)可减少IO开销,适用于随机访问为主的场景", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2147483648 ++ ], ++ "dtype": "int", ++ "set": "/sbin/blockdev --setra $param_value /dev/sda", ++ "get": "/sbin/blockdev --getra /dev/sda" ++ }, ++ "block.fifo_batch": { ++ "desc": "1. 当系统需要更高吞吐量且能容忍更高延迟时(如批量数据处理场景),可适当增大该值(如32-64) \n2. 当系统对延迟敏感(如实时数据库)且当前吞吐量足够时,可降低该值(如8-12)以减少单个批次的处理延迟", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 128 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/queue/iosched/fifo_batch", ++ "get": "cat /sys/block/sda/queue/iosched/fifo_batch" ++ }, ++ "block.front_merges": { ++ "desc": "1. 在I/O负载主要来自顺序写入且存储设备性能良好时,建议保持默认值1以允许前向合并,这能减少请求数量提升吞吐量\n\n2. 当系统处理大量随机I/O或使用某些特定存储设备时出现性能下降,可尝试将该参数设为0禁用前向合并,减少不必要的合并操作开销", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/iosched/front_merges", ++ "get": "cat /sys/block/sda/queue/iosched/front_merges" ++ }, ++ "block.read_expire": { ++ "desc": "1. 当系统主要处理高优先级读操作(如数据库查询)且存在读延迟敏感型应用时,可适当降低该值(如从默认的125ms降至50-100ms),确保读请求能更快得到响应\n\n2. 若系统频繁出现读请求超时丢弃现象(可通过监控deadline调度器的统计信息发现),且存储设备实际响应能力优于当前设置,应适当调高该值(如增至150-200ms)以避免不必要的请求重试", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/queue/iosched/read_expire", ++ "get": "cat /sys/block/sda/queue/iosched/read_expire" ++ }, ++ "block.writes_starved": { ++ "desc": "1. 当系统主要处理随机读取密集型负载(如数据库服务)且需要低延迟响应时,可适当提高该值(默认2-5范围),优先处理读请求以减少读延迟\n\n2. 当系统存在大量顺序写操作(如日志写入、数据备份)且写性能成为瓶颈时,应降低该值(最小可设为1),防止读请求过度抢占I/O带宽影响写入吞吐量", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 10 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/queue/iosched/writes_starved", ++ "get": "cat /sys/block/sda/queue/iosched/writes_starved" ++ }, ++ "block.max_sectors_kb": { ++ "desc": "1. 当使用高性能存储设备(如NVMe SSD)且存在大量大块I/O操作时,可适当增大该值(如1024-4096 KB)以提高吞吐量\n2. 
当出现I/O错误或设备驱动不稳定时,应降低该值至默认值512 KB或更小以增强稳定性", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 1024 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/queue/max_sectors_kb", ++ "get": "cat /sys/block/sda/queue/max_sectors_kb" ++ }, ++ "block.queue_depth": { ++ "desc": "1. 当使用高性能存储设备(如NVMe SSD)且系统负载较高时,若观察到存储设备利用率不足或IOPS未达预期,可适当增加该值(通常建议从默认32逐步上调至64-256范围),但需确保不超过设备硬件队列深度限制\n\n2. 对于虚拟机环境或低性能旋转磁盘(如HDD),若延迟显著增加或出现请求超时,应将值降低至16-32范围以减少IO堆积", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 1024 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/device/queue_depth", ++ "get": "cat /sys/block/sda/device/queue_depth" ++ }, ++ "block.nr_requests": { ++ "desc": "1. 当系统有高性能存储设备(如NVMe SSD)且IOPS吞吐量不足时,可适当增加该值(默认128),建议范围256-1024,以充分发挥设备并行处理能力\n\n2. 当系统出现高延迟或请求堆积时,若存储设备为机械硬盘,应降低该值(建议64-128),避免单个设备队列过深导致寻道时间增加", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 2048 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/queue/nr_requests", ++ "get": "cat /sys/block/sda/queue/nr_requests" ++ }, ++ "block.read_ahead_kb": { ++ "desc": "1. 当系统主要运行顺序读取大文件的应用(如数据库、视频流服务)且内存充足时,可适当增大该值(如从默认128KB调整为512KB-1MB),以减少I/O等待时间\n\n2. 当系统内存压力较大或主要处理随机访问负载时,应降低该值(如调整为64KB或更低),避免预读过多无用数据占用宝贵的内存资源", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "int", ++ "set": "echo $param_value > /sys/block/sda/queue/read_ahead_kb", ++ "get": "cat /sys/block/sda/queue/read_ahead_kb" ++ }, ++ "block.rq_affinity": { ++ "desc": "1. 当系统在高I/O负载下出现CPU利用率不均衡时,建议调整该参数以提高本地CPU处理I/O请求的效率\n\n2. 在使用多队列存储设备时,若发现I/O性能未达到预期,建议调整此参数以充分利用多核CPU的并行处理能力", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/rq_affinity", ++ "get": "cat /sys/block/sda/queue/rq_affinity" ++ }, ++ "block.add_random": { ++ "desc": "1. 当系统对随机数质量要求极高且性能开销可接受时,建议启用该参数以增强熵池的随机性来源\n\n2. 在高性能计算或低延迟要求的场景下,若系统已有足够熵源,建议禁用该参数以避免I/O事件带来的额外开销", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/add_random", ++ "get": "cat /sys/block/sda/queue/add_random" ++ }, ++ "block.rotational": { ++ "desc": "1. 当存储设备为SSD时,必须将该参数设置为0,以避免系统错误地应用针对机械硬盘的I/O调度策略\n\n2. 当存储设备为机械硬盘时,该参数应保持默认值1,以确保系统能正确应用适合旋转介质的相关优化", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/rotational", ++ "get": "cat /sys/block/sda/queue/rotational" ++ }, ++ "block.scheduler": { ++ "desc": "1. 对于MySQL数据库场景,建议将block.scheduler设置为mq-deadline,该调度算法能更好地处理数据库的随机I/O负载,减少I/O延迟\n\n2. 如果系统使用的是SSD存储设备,可以考虑设置为none调度器,因为SSD没有机械磁盘的寻道时间,简单的FIFO队列调度即可发挥最佳性能", ++ "range": [ ++ "mq-deadline", ++ "kyber", ++ "bfq", ++ "none" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/scheduler", ++ "get": "cat /sys/block/sda/queue/scheduler | sed -n 's/.*\\[\\(.*\\)\\].*/\\1/p'" ++ }, ++ "block.write_cache": { ++ "desc": "1. 当系统需要更高的写入性能且能容忍少量数据丢失风险时,建议设置为 write back 模式\n\n2. 当数据安全性要求极高且性能不是首要考虑时,建议设置为 write through 模式", ++ "range": [ ++ "write back", ++ "write through" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/write_cache", ++ "get": "cat /sys/block/sda/queue/write_cache" ++ }, ++ "block.nomerges": { ++ "desc": "1. 仅在调试I/O请求合并相关问题时设置为1或2,生产环境应保持默认值0以获得合并带来的性能优势 \n2. 
当使用blktrace等工具进行底层块设备分析时,可临时禁用合并(设为2)以获取更精确的请求跟踪数据", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "echo $param_value > /sys/block/sda/queue/nomerges", ++ "get": "cat /sys/block/sda/queue/nomerges" ++ }, ++ "net.core.netdev_budget": { ++ "desc": "1. 当网络接口频繁出现丢包(ifconfig显示RX dropped增加)且CPU软中断(softirq)占用过高时,建议适当增大该值(默认300可尝试调整为600-800)以提升单次软中断处理的包数量,减少中断次数\n\n2. 在低吞吐量但延迟敏感型场景(如高频交易系统)中,若网络延迟出现波动,可尝试降低该值(如调整为150-200)以减少单次软中断处理时间,降低处理延迟", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.netdev_budget=$param_value", ++ "get": "sysctl -n net.core.netdev_budget" ++ }, ++ "net.core.optmem_max": { ++ "desc": "- 当应用程序(如高性能网络服务)需要处理大量并发连接或大容量数据时,若出现 socket 缓冲区不足导致的性能瓶颈,可适当增加该值\n- 在内存资源充足的服务器上,若默认值(通常为 20480)无法满足特定应用(如视频流、大数据传输)的需求,可按 2 的幂次方逐步调高至合理范围(如 65536 或 131072)", ++ "type": "continuous", ++ "range": [ ++ 20480, ++ 204800 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.optmem_max=$param_value", ++ "get": "sysctl -n net.core.optmem_max" ++ }, ++ "net.core.wmem_max": { ++ "desc": "1. 当服务器处理大量高吞吐量网络连接(如视频流、大文件传输等场景)时出现写缓冲区不足导致的性能瓶颈,建议将值从默认229376调整为16777216\n\n2. 在高并发TCP长连接场景(如WebSocket服务、消息队列等)中观察到因写缓冲区溢出导致的连接异常或数据丢失时,建议采用16777216值", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ 67108864 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.wmem_max=$param_value", ++ "get": "sysctl -n net.core.wmem_max" ++ }, ++ "net.core.wmem_default": { ++ "desc": "1. 当应用主要处理大量小数据包传输时,若网络吞吐量低于预期且系统监控显示发送缓冲区频繁填满,可适当增大该值至32768-65535字节范围,减少频繁缓冲区填满导致的延迟 \n\n2. 在高带宽高延迟网络环境下(如跨数据中心传输),若TCP窗口缩放功能已启用但实际窗口仍受限于默认值,应将该值提升至至少163840字节(160KB)以匹配BDP(带宽延迟积)需求", ++ "type": "continuous", ++ "range": [ ++ 8192, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.wmem_default=$param_value", ++ "get": "sysctl -n net.core.wmem_default" ++ }, ++ "net.core.rmem_default": { ++ "desc": "1. 当应用需要处理大量网络数据流(如视频流、大数据传输)且观察到频繁的TCP窗口缩放或重传时,建议将值从默认的212992字节提升至1-4MB范围(1048576-4194304字节),需配合net.core.rmem_max同步调整\n\n2. 在高吞吐低延迟网络环境(如10Gbps以上)中,若netstat -s显示\"pruned\"或\"collapsed\"包统计持续增长,建议将值设置为BDP(带宽延迟积)的1/4至1/2,计算公式为:(带宽(bps) × 往返时延(s)) / 8 × 0.25", ++ "type": "continuous", ++ "range": [ ++ 8192, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.rmem_default=$param_value", ++ "get": "sysctl -n net.core.rmem_default" ++ }, ++ "net.core.rmem_max": { ++ "desc": "1. 当应用需要处理高吞吐量网络数据流(如视频流、大数据传输)时,应将此值调整为16777216以提升接收性能 \n2. 在存在大量TCP长连接且频繁出现接收缓冲区不足警告(如内核日志报\"TCP: too much of memory\")时,应增大该值", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ 67108864 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.rmem_max=$param_value", ++ "get": "sysctl -n net.core.rmem_max" ++ }, ++ "net.core.netdev_max_backlog": { ++ "desc": "1. 当服务器频繁出现网络丢包或高负载时,且通过监控发现 netdev_backlog 值持续接近或达到当前 netdev_max_backlog 设置值,应适当增大该参数值(例如从默认的1000调整为2000-3000) \n\n2. 对于10Gbps及以上高速网络接口,若默认值导致数据包处理延迟增加,需根据实际网络吞吐量和CPU处理能力按比例提升该参数值", ++ "type": "continuous", ++ "range": [ ++ 1000, ++ 100000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.netdev_max_backlog=$param_value", ++ "get": "sysctl -n net.core.netdev_max_backlog" ++ }, ++ "net.ipv4.tcp_thin_linear_timeouts": { ++ "desc": "1. 当服务器处理大量短生命周期TCP连接且频繁出现超时重传时,建议启用该参数(tcp_thin_linear_timeouts=1)以更精确检测瘦流并减少不必要的重传等待时间 \n\n2. 
若服务器主要处理大文件传输或视频流等持续高吞吐连接,建议保持默认值(tcp_thin_linear_timeouts=0)以避免对正常数据流产生误判", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_thin_linear_timeouts=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_thin_linear_timeouts" ++ }, ++ "net.unix.max_dgram_qlen": { ++ "desc": "1. 当系统频繁处理大量UDP数据报且出现丢包现象时,应考虑增加该值以提高队列容量\n2. 在高吞吐量UDP应用场景中,若观察到应用处理速度跟不上数据接收速度导致队列溢出,应适当调高此参数", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.unix.max_dgram_qlen=$param_value", ++ "get": "sysctl -n net.unix.max_dgram_qlen" ++ }, ++ "net.core.somaxconn": { ++ "desc": "1. 当服务器需要处理大量并发连接请求(如高负载Web服务器)且出现连接被丢弃或排队延迟时,应将此值从默认128增大到1024或更高\n\n2. 在运行需要频繁建立短连接的服务(如反向代理、负载均衡器)时,建议将该值调整为至少等于或大于服务的worker_processes与worker_connections乘积的1/4", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 65536 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.somaxconn=$param_value", ++ "get": "sysctl -n net.core.somaxconn" ++ }, ++ "net.core.busy_poll": { ++ "desc": "1. 在高吞吐量网络环境中(如10Gbps以上),若CPU利用率不足且存在延迟敏感型应用,可适当增加该值(如50-100微秒)以减少中断频率,但需监控CPU负载避免过度占用\n\n2. 在低延迟网络环境(如高频交易系统)中,若网络延迟指标不达标且CPU资源充足,可尝试设置为0禁用该功能,强制使用中断模式降低延迟", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 200 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.busy_poll=$param_value", ++ "get": "sysctl -n net.core.busy_poll" ++ }, ++ "net.core.busy_read": { ++ "desc": "1. 当网络设备处理高吞吐量小包时出现频繁读超时或性能下降,可尝试增加该值至100-200微秒范围,需结合具体硬件性能测试确定最优值\n\n2. 在低延迟网络环境中若观察到CPU使用率异常升高且与网络中断处理相关,可测试降低该值至20-30微秒以减少等待时间", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 200 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.busy_read=$param_value", ++ "get": "sysctl -n net.core.busy_read" ++ }, ++ "net.core.dev_weight": { ++ "desc": "1. 当网络中断处理成为性能瓶颈时(通过监控发现CPU软中断时间占比过高),可适当增加该值以提高单次中断处理的数据包数量,但需注意避免单个CPU过载\n\n2. 对于高吞吐量网卡(如10G/25G以上)或大量小包场景,建议将该值从默认的64提高到128-256范围,需结合具体硬件和负载测试确定最优值", ++ "type": "continuous", ++ "range": [ ++ 16, ++ 1024 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.dev_weight=$param_value", ++ "get": "sysctl -n net.core.dev_weight" ++ }, ++ "net.ipv4.tcp_keepalive_intvl": { ++ "desc": "1. 当服务器需要检测长时间空闲连接的健康状态时,若默认值75秒导致故障检测延迟过高,可适当减小至30-60秒范围以加快故障发现,但需权衡网络负载增加的影响\n\n2. 在高延迟网络环境中,若频繁出现误判连接中断的情况,可考虑增大该值至90-120秒范围以减少不必要的探测流量,同时需配合调整tcp_keepalive_probes确保总体检测窗口合理", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 300 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_keepalive_intvl=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_keepalive_intvl" ++ }, ++ "net.ipv4.tcp_keepalive_probes": { ++ "desc": "1. 当服务器需要快速检测并释放失效连接(如负载均衡器后端健康检查场景)时,可适当减少该值(默认9),建议调整为3-5次以加快失效连接回收\n\n2. 在高延迟或不可靠网络环境中(如跨国VPN),为防止误判活跃连接为失效,应增大该值至12-15次,同时配合调整tcp_keepalive_time和tcp_keepalive_intvl参数", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 144 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_keepalive_probes=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_keepalive_probes" ++ }, ++ "net.ipv4.tcp_keepalive_time": { ++ "desc": "1. 当服务器需要检测长时间空闲连接的有效性时(如负载均衡器或反向代理场景),若默认值(7200秒)过长可能导致无效连接占用资源,可适当调低至300-600秒区间 \n\n2. 在高并发短连接业务场景下,若出现大量TIME_WAIT状态连接导致端口耗尽,可配合减小tcp_keepalive_probes和tcp_keepalive_intvl参数,将本参数值提升至10800秒以上以减少keepalive探测频率", ++ "type": "continuous", ++ "range": [ ++ 600, ++ 36000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_keepalive_time=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_keepalive_time" ++ }, ++ "net.ipv4.tcp_tw_reuse": { ++ "desc": "1. 当服务器面临大量短连接请求且TIME-WAIT状态连接过多导致端口耗尽时,建议启用该参数(设置为1)以复用TIME-WAIT套接字\n2. 
在NAT网络环境下或需要严格保证TCP连接可靠性的场景下,建议保持该参数为默认值0以避免潜在连接混乱风险", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_tw_reuse=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_tw_reuse" ++ }, ++ "net.ipv4.tcp_window_scaling": { ++ "desc": "1. 在高带宽或高延迟网络环境下(如长距离传输或高速网络),应确保该参数值为1以启用窗口缩放功能,提升大窗口TCP连接性能\n\n2. 当网络设备不支持RFC 1323或存在兼容性问题时,应将该参数设为0以禁用窗口缩放功能", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_window_scaling=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_window_scaling" ++ }, ++ "net.ipv4.tcp_fin_timeout": { ++ "desc": "1. 当服务器需要处理大量短连接请求时,如果观察到大量连接处于FIN_WAIT_2状态导致端口耗尽,建议将该值从默认的60秒降低到30秒或更低,以加快连接资源释放\n\n2. 对于高延迟网络环境或需要保持长时间连接的应用场景,如果发现连接异常终止问题,建议适当增加该值至120秒以上,确保连接正常关闭", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 120 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_fin_timeout=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_fin_timeout" ++ }, ++ "net.ipv4.udp_mem": { ++ "desc": "1. 当服务器频繁处理大量UDP流量(如DNS服务器、视频流服务器)且出现丢包或性能下降时,可适当增加high值(如默认值的2-3倍),确保有足够内存缓冲队列数据包\n\n2. 若系统空闲内存充足但UDP应用仍频繁触发压力模式(可通过监控/proc/net/sockstat观察),应按比例同步提高low和assure值(如low设为总内存的1%,assure设为2%)以避免不必要的内存回收抖动", ++ "range": [ ++ "12582912 16777216 25165824", ++ "25165824 33554432 50331648", ++ "50331648 100663296" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.udp_mem=$param_value", ++ "get": "sysctl -n net.ipv4.udp_mem" ++ }, ++ "net.ipv4.tcp_mem": { ++ "desc": "1. 当系统在高并发TCP连接场景下出现内存不足或频繁触发OOM killer时,应适当增加三个值(最小压力值/压力阈值/最大值),建议按总物理内存的1%-3%计算,并确保最大值不超过系统可用内存的50%\n\n2. 若系统出现TCP性能下降或连接被拒绝(尤其在高吞吐量场景),需检查当前值是否过小,建议将最小值设为当前活跃连接内存占用的1.5倍,最大值设为系统空闲内存的30%-40%", ++ "range": [ ++ "6168306 8224411 12336612", ++ "12336612 16448822 24673224" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_mem=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_mem" ++ }, ++ "net.ipv4.tcp_rmem": { ++ "desc": "1. 在高吞吐量网络环境中(如视频流服务器、大数据传输节点),当默认最大值6291456(6MB)导致TCP接收窗口成为瓶颈时,建议将第三个值调整为16777216(16MB)以提升吞吐量\n\n2. 对于内存资源受限的服务器(如云主机或容器环境),若默认值87380(85KB)的初始缓冲区导致内存压力,可将中间值降至65536(64KB)以平衡性能与资源消耗", ++ "range": [ ++ "4096 16384 4194304", ++ "4096 32768 8388608", ++ "4096 65536 16777216" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_rmem=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_rmem" ++ }, ++ "net.ipv4.tcp_wmem": { ++ "desc": "1. 在高吞吐量网络环境中(如视频流服务器、文件传输服务器),建议将参数调整为 4096 65536 16777216,以提升大流量场景下的TCP写缓冲区性能\n\n2. 对于内存资源受限的服务器(如云主机或容器环境),若出现内存压力时应适当降低最大值(如调整为 4096 32768 8388608),避免TCP写缓冲区占用过多内存", ++ "range": [ ++ "4096 16384 4194304", ++ "4096 32768 8388608", ++ "4096 65536 16777216" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_wmem=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_wmem" ++ }, ++ "net.ipv4.tcp_fastopen": { ++ "desc": "1. 当服务器主要处理大量短连接请求(如HTTP API服务)且需要降低TCP握手延迟时,建议启用该参数(值为3同时支持客户端和服务器端)\n\n2. 当服务器处于严格安全环境或处理敏感数据时,建议禁用该参数(值为0)以避免潜在的安全风险", ++ "range": [ ++ "1", ++ "2", ++ "4" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_fastopen=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_fastopen" ++ }, ++ "net.ipv4.tcp_synack_retries": { ++ "desc": "1. 当服务器处于高并发连接场景且出现大量SYN_RECV状态连接时,若网络延迟较高,可适当增加该值(默认5)至7-10次,确保在拥塞环境下完成三次握手\n\n2. 
若服务器遭受SYN Flood攻击或处于高负载状态,可降低该值至2-3次以快速释放半连接资源,减少SYN队列占用时间", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 64 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_synack_retries=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_synack_retries" ++ }, ++ "net.ipv4.tcp_syn_retries": { ++ "desc": "1. 在延迟较高或不稳定的网络环境中(如跨国网络或移动网络),建议将默认值6适当增加到8-10,以应对可能出现的SYN丢包情况,但需注意这会延长连接建立失败时的等待时间\n\n2. 对于内网或低延迟高可靠网络环境,建议降低到3-4以减少连接建立超时等待时间,提高应用响应速度", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 64 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_syn_retries=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_syn_retries" ++ }, ++ "net.ipv4.tcp_moderate_rcvbuf": { ++ "desc": "1. 当应用需要处理大量突发流量时,建议启用该参数(设置为1),系统会自动调整接收缓冲区大小以适应流量变化\n2. 在内存资源受限的环境中,建议禁用该参数(设置为0),避免系统自动扩大接收缓冲区导致内存压力增加", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_moderate_rcvbuf=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_moderate_rcvbuf" ++ }, ++ "net.ipv4.tcp_timestamps": { ++ "desc": "1. 在存在NAT设备或负载均衡器的网络环境中,建议禁用该参数(设置为0),以避免可能的时间戳冲突导致的连接问题\n2. 在高速低延迟的内网环境中,建议启用该参数(设置为1),以获得更精确的RTT计算和更好的TCP性能", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_timestamps=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_timestamps" ++ }, ++ "net.ipv4.tcp_dsack": { ++ "desc": "1. 在延迟敏感型应用环境中(如高频交易系统),建议设置为0(禁用)以减少不必要的ACK确认包带来的网络开销\n\n2. 在常规Web服务或文件传输场景下保持默认值1(启用),可帮助处理网络丢包情况下的数据重传效率", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_dsack=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_dsack" ++ }, ++ "net.ipv4.tcp_fack": { ++ "desc": "1. 在存在高延迟或高丢包率的网络环境中,建议启用该参数以改善TCP重传性能,通过选择性确认减少不必要的重传\n\n2. 当服务器作为高性能网络应用(如视频流、大文件传输)的接收端时,建议保持启用状态以优化TCP拥塞控制机制", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_fack=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_fack" ++ }, ++ "net.ipv4.tcp_sack": { ++ "desc": "1. 在广域网(WAN)通信环境下应保持启用(1),可显著改善高延迟或丢包网络中的TCP传输性能,即使会略微增加CPU负载\n2. 在低延迟、高带宽的局域网(LAN)环境中可考虑禁用(0),特别是当系统CPU资源已高度饱和且网络质量极佳时", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_sack=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_sack" ++ }, ++ "net.ipv4.tcp_low_latency": { ++ "desc": "在高吞吐量且对延迟敏感的集群环境(如Beowulf集群)中应启用(设置为1) \n在普通网络环境下保持禁用(设置为0)以避免不必要的开销", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_low_latency=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_low_latency" ++ }, ++ "net.ipv4.tcp_adv_win_scale": { ++ "desc": "1. 当应用需要处理大量小包网络流量时,若观察到TCP接收窗口利用率不足,可考虑增大该值(如设为2或3),以减少缓冲区开销比例,提升小包传输效率\n\n2. 在内存资源紧张的服务器环境中,若发现TCP内存消耗过高导致系统频繁OOM,可适当降低该值(如设为1或0),增加缓冲区开销比例以降低内存使用量", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 4 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_adv_win_scale=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_adv_win_scale" ++ }, ++ "net.ipv4.route.max_size": { ++ "desc": "1. 当服务器频繁处理大量网络连接或作为路由器转发大量数据包时,若观察到路由缓存频繁刷新导致性能下降,可适当增加该值(默认值4096可逐步倍增测试)\n\n2. 在高内存压力环境下,若路由表占用内存过高影响其他服务,且实际活跃路由条目远低于当前设置值,可适当降低该值以释放内存", ++ "type": "continuous", ++ "range": [ ++ 67108864, ++ 2080374784 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.route.max_size=$param_value", ++ "get": "sysctl -n net.ipv4.route.max_size" ++ }, ++ "net.ipv4.tcp_max_tw_buckets": { ++ "desc": "1. 
当服务器出现大量TIME_WAIT状态的TCP连接导致端口耗尽或性能下降时,建议将net.ipv4.tcp_max_tw_buckets从默认值2048调整为360000。\n\n2. 在高并发短连接场景下,若监控发现TIME_WAIT连接数频繁达到上限,可适当增大该值至360000以提升连接处理能力。", ++ "type": "continuous", ++ "range": [ ++ 32768, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_max_tw_buckets=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_max_tw_buckets" ++ }, ++ "net.ipv4.tcp_max_syn_backlog": { ++ "desc": "1. 当服务器频繁处理大量新连接请求且出现 SYN 包丢弃时,应考虑增大该值至 8192 或更高\n\n2. 在高并发短连接场景下,若监控发现 SYN_RECV 状态连接数常接近默认值 2048,应调整该参数以避免连接建立延迟", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 262144 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_max_syn_backlog=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_max_syn_backlog" ++ }, ++ "net.ipv4.tcp_max_orphans": { ++ "desc": "1. 当服务器频繁出现\"Out of socket memory\"错误或日志中出现大量orphaned sockets警告时,需要增加该值。建议根据当前系统内存容量调整,通常设置为内存容量的1/4对应的socket数量(每个orphan约占用64KB内存)\n\n2. 对于高并发短连接服务(如HTTP服务器、负载均衡器),若观察到tcp_max_orphans限制成为性能瓶颈(通过监控/proc/net/sockstat中orphan数量接近上限),应适当调高该值至并发连接数的1.2-1.5倍", ++ "type": "continuous", ++ "range": [ ++ 65536, ++ 16777216 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_max_orphans=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_max_orphans" ++ }, ++ "net.ipv4.tcp_ecn": { ++ "desc": "1. 当网络中存在不支持ECN的老旧网络设备时,建议保持默认值0(禁用),以避免潜在的数据包丢弃问题\n\n2. 在确认网络设备完全支持ECN且需要降低TCP重传率的环境中,建议设置为1(启用)以获得更好的拥塞控制性能", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_ecn=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_ecn" ++ }, ++ "net.ipv4.ip_forward": { ++ "desc": "- 当服务器需要作为路由器或VPN网关时,应设置为1以启用IPv4转发功能\n- 当服务器仅作为终端主机使用时,应保持默认值0以禁用转发功能,减少潜在安全风险", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.ip_forward=$param_value", ++ "get": "sysctl -n net.ipv4.ip_forward" ++ }, ++ "net.ipv4.conf.default.rp_filter": { ++ "desc": "1. 当服务器作为路由器或需要处理多路径网络流量时,建议将rp_filter设置为2(宽松模式),以避免严格的反向路径验证导致合法流量被丢弃\n2. 在单网卡服务器且网络环境可信的情况下,可以设置为0(关闭验证)以减少内核处理开销,但需确保网络环境无IP欺骗风险", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.default.rp_filter=$param_value", ++ "get": "sysctl -n net.ipv4.conf.default.rp_filter" ++ }, ++ "net.ipv4.tcp_no_metrics_save": { ++ "desc": "1. 在高并发短连接场景下,建议设置为1以禁用TCP连接参数保存,避免因大量无效参数缓存导致内存浪费和性能下降\n2. 在需要保持长连接稳定性的场景下,建议保持默认值0,允许重用之前连接的有效参数来优化新连接建立性能", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_no_metrics_save=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_no_metrics_save" ++ }, ++ "net.ipv4.ip_default_ttl": { ++ "desc": "1. 当网络中存在多层NAT或复杂路由环境时,若出现数据包提前被丢弃的情况,可考虑将TTL值提高到128,确保数据包能到达更远的网络节点\n\n2. 对于需要限制数据包传播范围的场景(如内部测试网络),可降低TTL值至32以下,防止数据包在网络中过度传播", ++ "type": "continuous", ++ "range": [ ++ 8, ++ 128 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.ip_default_ttl=$param_value", ++ "get": "sysctl -n net.ipv4.ip_default_ttl" ++ }, ++ "net.ipv4.ip_no_pmtu_disc": { ++ "desc": "1. 当网络中存在路径MTU发现(PMTUD)问题导致连接超时或性能下降时,建议将该参数设为1以禁用PMTUD,避免因ICMP黑洞或防火墙丢弃数据包导致的连接问题\n\n2. 在高速网络环境(如10Gbps以上)且网络设备可靠支持PMTUD时,建议保持默认值0以启用PMTUD,确保TCP能动态发现最优MTU值提升吞吐量", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.ip_no_pmtu_disc=$param_value", ++ "get": "sysctl -n net.ipv4.ip_no_pmtu_disc" ++ }, ++ "net.ipv4.tcp_retries2": { ++ "desc": "1. 对于高延迟或不稳定网络环境中的服务器,可考虑将值降低到5-8,减少因网络临时故障导致的连接长时间挂起问题\n2. 
对于需要快速检测连接失效的金融交易类服务器,建议设置为3-5,确保能更快释放失效连接资源", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 30 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_retries2=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_retries2" ++ }, ++ "net.ipv4.tcp_orphan_retries": { ++ "desc": "1. 当服务器面临大量半连接(orphaned sockets)导致资源占用过高时,可适当降低该值(如3-5),加速释放资源,但需注意过低可能导致正常长延迟网络下的连接被过早丢弃\n\n2. 若服务器主要处理本地或低延迟网络通信,且出现过多重试浪费资源的情况,可降至2-3次以减少不必要的等待时间", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 15 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_orphan_retries=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_orphan_retries" ++ }, ++ "net.ipv4.tcp_syncookies": { ++ "desc": "1. 当服务器频繁遭受SYN flood攻击时,应启用该参数(设置为1),以保护系统资源不被耗尽\n2. 在正常网络环境下且未遭受攻击时,建议保持默认值(通常为1),因为启用syncookies可能导致TCP连接性能略微下降", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_syncookies=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_syncookies" ++ }, ++ "net.ipv4.tcp_reordering": { ++ "desc": "1. 当网络路径存在较高丢包率或频繁重排序时,若观察到TCP重传率明显上升且吞吐量下降,应考虑适当增大该值(默认3可尝试调整为9-12),以容忍更多乱序数据包而非错误触发快速重传\n\n2. 在低延迟网络环境(如数据中心内部)且使用TSO/GRO等卸载技术时,若内核日志频繁出现\"TCP: too many of order packets\"警告,可将该值适度降低(如调整为6-8),减少乱序队列内存占用", ++ "type": "continuous", ++ "range": [ ++ 2, ++ 10 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_reordering=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_reordering" ++ }, ++ "net.ipv4.tcp_retrans_collapse": { ++ "desc": "1. 在Linux服务器环境中,若确认无老旧打印机设备需要兼容,建议禁用此参数以优化TCP重传性能\n\n2. 当网络吞吐量出现异常下降且排查其他因素无果时,可尝试禁用此参数观察是否由打印机兼容性功能引起", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_retrans_collapse=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_retrans_collapse" ++ }, ++ "net.ipv4.tcp_congestion_control": { ++ "desc": "1. 在高带宽、高延迟网络环境下(如跨数据中心通信),建议将默认的\"cubic\"算法切换为\"bbr\",可更充分利用带宽并减少排队延迟\n\n2. 在无线网络或移动网络环境中,若出现频繁丢包,建议使用\"vegas\"或\"westwood\"算法,这些算法对丢包区分更准确,能避免误判拥塞", ++ "range": [ ++ "cubic", ++ "reno", ++ "bbr" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.tcp_congestion_control=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_congestion_control" ++ }, ++ "net.ipv4.conf.default.promote_secondaries": { ++ "desc": "1. 当服务器需要保持高可用性且依赖多个IP地址时,建议设置为1,确保主IP被移除时次IP能自动提升为主IP,避免服务中断\n\n2. 在安全性要求严格的环境中建议设置为0,确保主IP被移除时所有关联IP都被清除,防止潜在的安全风险", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.default.promote_secondaries=$param_value", ++ "get": "sysctl -n net.ipv4.conf.default.promote_secondaries" ++ }, ++ "net.ipv4.conf.all.promote_secondaries": { ++ "desc": "1. 当服务器需要保持高可用性且依赖多个IP地址时,建议设置为1,以确保主IP被移除时次IP能自动提升为主IP,避免服务中断\n\n2. 当服务器IP地址管理需要严格遵循变更控制流程时,建议保持默认值0,以确保任何IP地址变更都需要明确操作,防止意外配置变更", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.all.promote_secondaries=$param_value", ++ "get": "sysctl -n net.ipv4.conf.all.promote_secondaries" ++ }, ++ "net.ipv4.conf.all.accept_redirects": { ++ "desc": "1. 对于作为网关或路由器的Linux服务器,建议设置为0以禁用ICMP重定向消息,防止潜在的网络拓扑欺骗攻击\n2. 对于普通主机服务器,若网络环境可信且需要ICMP重定向功能优化路由,可保持默认值1;否则建议设置为0增强安全性", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.all.accept_redirects=$param_value", ++ "get": "sysctl -n net.ipv4.conf.all.accept_redirects" ++ }, ++ "net.ipv4.conf.default.accept_redirects": { ++ "desc": "1. 在作为路由器使用时,建议设置为0以禁用ICMP重定向消息,防止潜在的网络拓扑混淆和安全风险\n2. 
在作为终端主机使用时,可保持默认值1以接受重定向消息,但若网络环境安全要求较高,建议同样设置为0", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.default.accept_redirects=$param_value", ++ "get": "sysctl -n net.ipv4.conf.default.accept_redirects" ++ }, ++ "net.ipv4.conf.all.secure_redirects": { ++ "desc": "- 在安全要求较高的生产环境中,建议设置为0禁用,避免潜在的安全风险\n- 若网络环境需要接收特定ICMP重定向且信任网关,可设置为1但需配合其他安全措施", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.all.secure_redirects=$param_value", ++ "get": "sysctl -n net.ipv4.conf.all.secure_redirects" ++ }, ++ "net.ipv4.conf.default.secure_redirects": { ++ "desc": "1. 在安全要求较高的生产环境中建议设置为0,防止潜在的网络重定向攻击\n2. 如果服务器需要接收来自可信网关的ICMP重定向消息以优化路由,可设置为1", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.default.secure_redirects=$param_value", ++ "get": "sysctl -n net.ipv4.conf.default.secure_redirects" ++ }, ++ "net.ipv4.icmp_echo_ignore_broadcasts": { ++ "desc": "1. 如果服务器处于可能遭受ICMP广播风暴攻击的网络环境(如公开网络或DMZ区域),建议设置为1以避免资源耗尽\n\n2. 如果服务器位于受保护的内网且需要接收ICMP广播(如网络设备发现等场景),建议设置为0以保持功能正常", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.icmp_echo_ignore_broadcasts=$param_value", ++ "get": "sysctl -n net.ipv4.icmp_echo_ignore_broadcasts" ++ }, ++ "net.nf_conntrack_max": { ++ "desc": "1. 当服务器处理大量网络连接(如高并发代理、NAT网关或防火墙)且频繁出现\"nf_conntrack: table full\"日志时,需要增加该值以避免连接跟踪表溢出\n\n2. 当系统内存不足且连接跟踪表利用率持续低于50%时,可适当降低该值以释放内存资源", ++ "type": "continuous", ++ "range": [ ++ 65536, ++ 1048576 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.nf_conntrack_max=$param_value", ++ "get": "sysctl -n net.nf_conntrack_max" ++ }, ++ "net.netfilter.nf_conntrack_tcp_timeout_established": { ++ "desc": "1. 当服务器处理大量持久TCP连接(如长连接服务、代理服务器等)且观察到nf_conntrack表频繁满导致丢包时,可适当增大该值(默认43200秒/12小时),但需确保不超过客户端实际连接保持时间,避免无效连接占用资源\n\n2. 对于短连接为主的Web服务器环境,若系统内存压力较大且连接跟踪表占用过高,可适当降低该值(但不应小于1800秒),以加速连接表项回收", ++ "type": "continuous", ++ "range": [ ++ 108000, ++ 1728000 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.netfilter.nf_conntrack_tcp_timeout_established=$param_value", ++ "get": "sysctl -n net.netfilter.nf_conntrack_tcp_timeout_established" ++ }, ++ "net.netfilter.nf_conntrack_tcp_timeout_close_wait": { ++ "desc": "1. 当服务器处理大量短连接且频繁出现close_wait状态时,若该值过大(默认240秒)会导致连接资源长时间占用,可适当降低至60-120秒范围\n\n2. 对于长连接为主的服务器环境,若发现连接异常断开导致资源泄漏,可考虑增大该值至300-600秒范围", ++ "type": "continuous", ++ "range": [ ++ 15, ++ 240 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.netfilter.nf_conntrack_tcp_timeout_close_wait=$param_value", ++ "get": "sysctl -n net.netfilter.nf_conntrack_tcp_timeout_close_wait" ++ }, ++ "net.netfilter.nf_conntrack_tcp_timeout_fin_wait": { ++ "desc": "1. 当服务器处理大量短连接且频繁出现FIN_WAIT状态连接堆积时,若系统日志显示nf_conntrack表频繁满导致丢包,可适当减少该值至30-60秒范围以加速连接回收\n\n2. 若服务器主要处理长连接且并发连接数远低于nf_conntrack_max的80%,出现FIN_WAIT状态连接过早超时导致异常断开时,可增大该值至120-300秒范围确保正常连接终止流程完成", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 480 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.netfilter.nf_conntrack_tcp_timeout_fin_wait=$param_value", ++ "get": "sysctl -n net.netfilter.nf_conntrack_tcp_timeout_fin_wait" ++ }, ++ "net.netfilter.nf_conntrack_tcp_timeout_time_wait": { ++ "desc": "1. 当服务器处理大量短连接且出现大量TIME_WAIT状态连接导致nf_conntrack表满时,可适当降低该值(默认120秒),建议调整为30-60秒以更快释放连接跟踪条目\n\n2. 
若服务器作为反向代理或负载均衡器且出现端口耗尽问题,在确认无重传包风险后可考虑将该值降至15-30秒,但需确保大于TCP的2MSL时间", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 480 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.netfilter.nf_conntrack_tcp_timeout_time_wait=$param_value", ++ "get": "sysctl -n net.netfilter.nf_conntrack_tcp_timeout_time_wait" ++ }, ++ "net.ipv4.conf.default.forwarding": { ++ "desc": "1. 当服务器需要作为路由器或网关转发IPv4流量时,应将该参数设置为1,否则保持默认值0以关闭转发功能提升安全性\n2. 在容器或虚拟化环境中,若宿主机需要为虚拟机/容器提供网络转发功能,则需启用该参数", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string", ++ "set": "sysctl -w net.ipv4.conf.default.forwarding=$param_value", ++ "get": "sysctl -n net.ipv4.conf.default.forwarding" ++ }, ++ "net.core.rps_sock_flow_entries": { ++ "desc": "1. 当服务器处理大量网络连接且RPS/RFS功能开启时,若出现CPU缓存命中率下降或网络延迟增加,应考虑增加该值(通常建议设置为32768或65536)\n\n2. 在高吞吐量网络环境下(如10Gbps以上),若网络性能未达预期且/proc/net/softnet_stat显示drop计数增长,应将该值调整为至少等于或大于预期并发连接数", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 131072 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.core.rps_sock_flow_entries=$param_value", ++ "get": "sysctl -n net.core.rps_sock_flow_entries" ++ }, ++ "net.ipv4.tcp_min_tso_segs": { ++ "desc": "1. 当服务器主要处理大量小数据包(如小于1460字节)且TSO利用率低时,可适当降低该值(默认2)以减少延迟,但需确保不低于1以避免性能下降\n\n2. 对于高速网络(10Gbps+)且处理大数据传输的场景,若观察到TSO分段不足导致CPU利用率过高,可适当增大该值(建议不超过8)以提升吞吐量", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 16 ++ ], ++ "dtype": "int", ++ "set": "sysctl -w net.ipv4.tcp_min_tso_segs=$param_value", ++ "get": "sysctl -n net.ipv4.tcp_min_tso_segs" ++ } ++} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/optimize/README.md b/copilot-tune/src/knowledge_base/optimize/README.md +new file mode 100644 +index 0000000..7edf875 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/optimize/README.md +@@ -0,0 +1 @@ ++性能优化知识库 +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/optimize/parameter/mysql.jsonl b/copilot-tune/src/knowledge_base/optimize/parameter/mysql.jsonl +new file mode 100644 +index 0000000..dbd6d1f +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/optimize/parameter/mysql.jsonl +@@ -0,0 +1,22 @@ ++{"param_name": "mysql.innodb_io_capacity", "content": "innodb_io_capacity参数通常在以下情况需要调整:当系统的磁盘I/O速度低于MySQL处理请求的速度,导致InnoDB缓存命中率下降,或者在数据库负载大幅增加、并发读写操作增多时,为了优化磁盘I/O性能和减少锁竞争,需要适当增大此值。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_thread_concurrency", "content": "当系统并发用户量大幅度增加,导致MySQL InnoDB引擎的事务处理压力增大,CPU利用率过高或者响应时间变长时,可能需要调整innodb_thread_concurrency参数以提高并发事务处理能力。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_lru_scan_depth", "content": "innodb_lru_scan_depth参数在MySQL InnoDB缓存中用于控制并发读取时对最近最少使用(LRU)页面的扫描范围。当系统并发读请求较高,导致InnoDB缓存命中率下降,频繁触发LRU替换操作,可能会引发锁定争抢或性能瓶颈时,需要调整此参数以提高查询效率,避免过多的磁盘I/O。如果发现查询性能瓶颈、锁定等待增加或者系统资源利用率不高,应考虑增大innodb_lru_scan_depth以减少扫描次数。但需谨慎调整,过大可能导致内存消耗增加。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_adaptive_hash_index", "content": "当系统处理大量具有重复键值的数据查询,且这些查询对性能有较高要求时,可能需要调整innodb_adaptive_hash_index参数以优化InnoDB引擎的哈希索引性能。如果发现全表扫描效率低下,或者并发读取频繁时,考虑增大此参数以提升查询速度。然而,过度调整可能导致内存消耗增加,因此需根据实际负载进行权衡。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_max_dirty_pages_pct", "content": "当MySQL InnoDB存储引擎的脏页数量超过innodb_max_dirty_pages_pct(InnoDB最大脏页百分比)所设定的阈值时,为了保持良好的缓存性能和数据一致性,可能需要调整此参数。如果服务器频繁达到脏页清理限制,或者观察到查询性能下降、宕机风险增加,应考虑增大此值以提高I/O效率,但必须谨慎,因为过高可能导致更多的磁盘I/O和写操作,从而影响整体系统性能。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_buffer_pool_instances", "content": "当并发用户量大幅增加,导致InnoDB写操作压力增大,或者服务器硬件升级后内存资源增多,为了提高并发事务处理能力,可能需要调整innodb_buffer_pool_instances参数。", 
"boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_flush_log_at_trx_commit", "content": "当系统负载较高,频繁事务提交导致日志写入压力增大,或者为了提高数据可靠性而希望实时持久化日志时,可能需要调整innodb_flush_log_at_trx_commit参数。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_adaptive_max_sleep_delay", "content": "innodb_adaptive_max_sleep_delay参数在MySQL InnoDB引擎中用于控制InnoDB在执行锁定表操作时,如果遇到长时间的锁竞争,会暂停一段时间再尝试获取锁。当系统并发高、锁竞争频繁导致查询响应延迟或者服务器性能瓶颈出现时,可能需要调整此参数以优化锁等待策略,避免资源过度占用。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_spin_wait_delay", "content": "当MySQL的InnoDB引擎遇到高并发写操作,且磁盘I/O延迟较高,导致spinlock竞争加剧时,可能需要调整innodb_spin_wait_delay参数以优化锁定性能,减少CPU空转时间。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_log_buffer_size", "content": "当系统处理大量事务、频繁写入操作导致日志文件频繁写入磁盘,影响性能时,或者磁盘I/O速度成为瓶颈, Innodb_log_buffer_size 需要调整以提高写入缓存效率,减少磁盘IO次数。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.thread_cache_size", "content": "当MySQL服务器处理高并发连接时,如果thread_cache_size(线程缓存大小)设置过小,频繁创建和销毁线程会增加CPU开销;反之,如果设置过大但并发连接不多,会浪费内存资源。因此,当系统并发连接数波动较大或者服务器资源充足且希望减少线程创建/销毁操作时,可能需要调整thread_cache_size参数。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_write_io_threads", "content": "当系统并发写入负载较高,导致InnoDB引擎的写入操作响应变慢,或者磁盘I/O成为性能瓶颈时,可能需要调整innodb_write_io_threads参数以优化MySQL的写入线程处理能力。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_change_buffering", "content": "当MySQL InnoDB存储引擎的innodb_change_buffering参数导致查询性能下降,如频繁的小范围插入或更新操作导致redo log写入过于频繁,或者系统内存充足且希望提高数据缓存命中率时,应考虑调整此参数。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_sync_spin_loops", "content": "innodb_sync_spin_loops参数通常在MySQL InnoDB引擎中用于控制在等待磁盘I/O同步时的自旋循环次数。当数据库服务器负载较高,频繁进行大容量数据写入且磁盘IO速度较慢,可能导致写操作阻塞,此时可能需要调整此参数以提高写入性能。如果设置过低,可能会增加CPU使用;设置过高则可能导致数据完整性风险。在监控到大量写操作延时并且确认磁盘瓶颈存在的情况下,可以考虑适当增加innodb_sync_spin_loops的值,但需谨慎,因为它与数据同步的可靠性是一对矛盾。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.sync_binlog", "content": "1. 高并发写入环境:磁盘利用率过高,磁盘I/O性能为系统瓶颈,出现读负载较高的情况时可能需要调整。\n2. 数据库响应时间要求高:减少同步频率可以减少I/O操作,提升查询响应速度,但增加了一定的数据一致性风险。\n3. 
服务器资源紧张:如果磁盘I/O或网络带宽有限,可能导致sync_binlog过程阻塞,此时可能需要减小同步频率以缓解压力。\n", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_read_io_threads", "content": "当系统负载高、并发读请求频繁,且发现InnoDB的磁盘I/O性能成为瓶颈时,可能需要调整innodb_read_io_threads参数以提高读取并发能力,减少响应延迟。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_max_dirty_pages_pct_lwm", "content": "当MySQL InnoDB引擎的脏页数量超过innodb_max_dirty_pages_pct_lwm(InnoDB最小脏页面百分比)所设定的阈值时,可能会影响数据库性能。如果频繁出现高内存占用或者查询响应变慢,应考虑调整此参数以优化内存管理,防止长时间的刷脏页到磁盘,减小I/O压力。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_log_files_in_group", "content": "当数据库服务器负载较高,频繁进行事务操作导致日志文件快速增长,或者磁盘空间不足时,可能需要调整innodb_log_files_in_group参数以提高写入性能或防止单个日志文件过大引发的问题。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_log_file_size", "content": "当MySQL InnoDB存储引擎的事务日志文件(innodb_log_file_size)过大或频繁地导致满,影响数据库写操作性能,或者磁盘空间不足时,需要调整此参数以优化磁盘I/O压力和提高事务处理能力。如果日志频繁切换,可能由于事务量增加、数据更新频繁或设置过小导致,这时也需要增大日志文件大小。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.tmp_table_size", "content": "当系统内存资源充足,且数据库频繁创建临时表以执行复杂查询,导致磁盘I/O压力增大,或者希望提高InnoDB表的性能时,可能需要调整MySQL的tmp_table_size参数。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_page_cleaners", "content": "当MySQL InnoDB存储引擎的innodb_page_cleaners参数值导致内存压力增大、查询性能下降或者磁盘I/O频繁时,可能需要调整该参数以优化表页清理策略,提高缓存效率。", "boottlneck_type": "mysql"} ++{"param_name": "mysql.innodb_adaptive_flushing_lwm", "content": "innodb_adaptive_flushing_lwm参数在MySQL InnoDB存储引擎中用于控制自适应flushing的最低写入量阈值。当InnoDB缓存中的脏页数量低于此阈值时,它会触发更频繁的flush操作以保持数据一致性。如果数据库频繁写入且I/O延迟较高,或者为了优化硬盘I/O利用,可能需要增加此参数值;相反,如果系统I/O资源充足或追求低延迟读性能,应降低此值以减少不必要的flush操作。", "boottlneck_type": "mysql"} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/optimize/parameter/spark.jsonl b/copilot-tune/src/knowledge_base/optimize/parameter/spark.jsonl +new file mode 100644 +index 0000000..054c11e +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/optimize/parameter/spark.jsonl +@@ -0,0 +1,9 @@ ++{"param_name": "sql.files.maxPartitionBytes", "content": "在处理大数据文件时,如果Spark任务由于单个partition过大导致内存溢出或者性能下降,或者文件拆分不均衡影响并行计算效率时,需要进行调整。", "boottlneck_type": "spark"} ++{"param_name": "dynamicAllocation.maxExecutors", "content": "当Spark集群的资源利用率不均衡,任务处理速度波动较大,或者在应对大规模数据处理时需要提升并发执行能力,动态分配的最大执行器数量可能需要调整以优化工作负载分布和提高整体性能。", "boottlneck_type": "spark"} ++{"param_name": "executor.cores", "content": "当Spark应用程序在处理大规模数据或者并发任务时出现性能瓶颈,如执行时间过长、资源利用率低或者响应速度慢,以根据集群硬件资源和工作负载动态分配核心数,提高任务并行处理能力。", "boottlneck_type": "spark"} ++{"param_name": "sql.adaptive.maxNumPostShufflePartitions", "content": "当Spark在处理大规模数据集或者执行复杂的SQL查询时,如果发现任务分解后的post-shuffle分区过多导致内存消耗过大或性能瓶颈,或者由于网络带宽限制引起的数据传输延迟。", "boottlneck_type": "spark"} ++{"param_name": "executor.memory", "content": "需要调整的情况包括:当系统内存资源不足,导致Spark任务执行缓慢或者频繁触发内存溢出异常;处理大规模数据时,为了优化任务并行性和效率,提升计算性能;或者在分布式集群中,根据节点的实际内存大小动态调整,以充分利用硬件资源。", "boottlneck_type": "spark"} ++{"param_name": "executor.memoryOverhead", "content": "在Spark应用中,当任务执行器内存消耗频繁超过预期,导致内存溢出或者性能瓶颈时,需要调整以优化内存利用率,减少垃圾回收压力,提高整体集群效率。", "boottlneck_type": "spark"} ++{"param_name": "driver.cores", "content": "driver.cores参数在以下情况需要调整:当Spark应用的Driver进程需要处理大量数据或者并发任务时,为了提高任务调度和执行效率,需要增加可用的CPU核心数,以减少任务等待时间,避免资源瓶颈。然而,如果资源有限或者硬件性能不足,应适当减少此参数以防止内存溢出或性能下降。", "boottlneck_type": "spark"} ++{"param_name": "driver.memory", "content": "当Spark应用程序在运行时遇到内存瓶颈,导致任务失败或者性能下降,例如内存溢出错误、作业执行缓慢,以提供更多的内存资源来优化driver进程的性能。", "boottlneck_type": "spark"} ++{"param_name": "driver.memoryOverhead", "content": "在Spark应用程序的驱动程序内存消耗超过预期,导致任务调度或数据处理性能下降,内存不足时需要进行调整。", "boottlneck_type": "spark"} +\ No newline at end of file
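The parameter knowledge files above are JSON Lines: one self-contained record per line, keyed by `param_name`, with free-text tuning guidance in `content` and a bottleneck tag spelled `boottlneck_type` consistently across the files, so any consumer has to match that key verbatim. A minimal sketch of how a retriever could load and filter these records; the helper name and directory argument are illustrative assumptions, not the project's actual API:

```python
import json
from pathlib import Path

def load_param_knowledge(kb_dir: str, bottleneck: str) -> list[dict]:
    """Collect the JSONL records whose boottlneck_type matches one tag."""
    records = []
    for path in sorted(Path(kb_dir).glob("*.jsonl")):
        with path.open(encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                record = json.loads(line)
                # The data files spell this key "boottlneck_type" (sic).
                if record.get("boottlneck_type") == bottleneck:
                    records.append(record)
    return records

# e.g. load_param_knowledge("src/knowledge_base/optimize/parameter", "mysql")
```

A RAG pipeline would more likely embed the `content` strings for similarity search; the tag filter above only narrows the candidate set by bottleneck type.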
+diff --git a/copilot-tune/src/knowledge_base/optimize/parameter/system.jsonl b/copilot-tune/src/knowledge_base/optimize/parameter/system.jsonl +new file mode 100644 +index 0000000..78a11cf +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/optimize/parameter/system.jsonl +@@ -0,0 +1,39 @@ ++{"param_name": "kernel.sched_min_granularity_ns", "content": "kernel.sched_min_granularity_ns参数可能需要调整在以下情况:当系统负载极高,CPU调度粒度过小导致上下文切换频繁,影响性能时;或者在追求更精细的任务调度,减少任务切换延迟的场景下。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_wakeup_granularity_ns", "content": "kernel.sched_wakeup_granularity_ns参数可能需要调整在以下情况:当系统负载极高,频繁的轻量级唤醒导致调度开销增加,或者在对实时性要求严格的任务中,为了提高唤醒效率和任务响应时间时。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_child_runs_first", "content": "当系统负载较高,且CPU资源分配偏向于父进程而非子进程时,可能需要调整kernel.sched_child_runs_first参数以优化子进程的调度,提升整体系统响应速度和并发处理能力。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_latency_ns", "content": "kernel.sched_latency_ns 参数可能需要调整在以下情况:当系统面临高并发负载,导致任务调度延迟增加,影响服务响应时间,或者在需要优化实时性要求高的任务处理时,为了减少任务切换时延,以提升整体系统性能。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_tunable_scaling", "content": "kernel.sched_tunable_scaling 参数可能需要调整在以下情况:当系统的负载变化频繁,导致调度算法效率下降,影响到应用程序响应时间时;多核处理器环境中,为了优化CPU间任务调度的动态平衡;以及在需要最大化系统整体性能,同时避免过度调度导致的CPU利用率不均时。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_migration_cost_ns", "content": "当系统负载较高,频繁发生进程迁移导致性能下降,或者在分布式架构中存在大规模数据传输时,可能需要调整kernel.sched_migration_cost_ns参数以优化调度效率和资源利用。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_nr_migrate", "content": "kernel.sched_nr_migrate参数在系统中处理大量进程迁移,如负载均衡或虚拟机调度时,如果出现性能瓶颈、延迟增加或者资源分配不均,可能需要调整此参数以优化进程迁移效率。", "boottlneck_type": "system"} ++{"param_name": "vm.dirty_expire_centisecs", "content": "当系统频繁发生内存交换(脏页过多),导致磁盘I/O压力增大,应用响应速度下降时,可能需要调整vm.dirty_expire_centisecs参数来优化缓存策略,以平衡内存和磁盘的使用效率。", "boottlneck_type": "system"} ++{"param_name": "vm.dirty_background_ratio", "content": "当系统内存使用率持续高位,频繁发生内存交换到磁盘(swap-in),导致系统性能下降时,可能需要调整vm.dirty_background_ratio参数以优化内存管理,防止过多的脏页留在内存中影响整体响应速度。", "boottlneck_type": "system"} ++{"param_name": "vm.dirty_background_bytes", "content": "当系统内存使用率持续高位,频繁发生页面交换导致磁盘I/O压力增大,应用性能下降时,可能需要调整vm.dirty_background_bytes参数以优化内存管理,减少后台写脏数据到磁盘的行为,从而提升系统响应速度。", "boottlneck_type": "system"} ++{"param_name": "vm.dirty_ratio", "content": "当系统频繁发生内存交换(Dirty Pages),导致磁盘I/O压力增大,应用性能下降时,可能需要调整vm.dirty_ratio参数。", "boottlneck_type": "system"} ++{"param_name": "vm.dirty_bytes", "content": "当系统频繁发生内存交换(Dirty Bytes,即被写入到交换空间的内存量增加),导致性能下降或磁盘I/O压力增大时,可能需要调整vm.dirty_bytes参数。", "boottlneck_type": "system"} ++{"param_name": "vm.dirty_writeback_centisecs", "content": "当系统频繁发生内存交换(Dirty Pages),导致写回脏数据到磁盘的I/O压力增大,影响整体性能时,可能需要调整vm.dirty_writeback_centisecs参数,以优化数据写回策略,平衡内存使用和I/O负载。", "boottlneck_type": "system"} ++{"param_name": "vm.swappiness", "content": "当系统内存使用率持续较高,但应用程序仍需大量内存时,为了提高虚拟内存的利用效率,可以考虑调整 vm.swappiness 参数以牺牲部分 swap 空间来提升系统响应速度。", "boottlneck_type": "system"} ++{"param_name": "vm.vfs_cache_pressure", "content": "当系统中频繁进行大量文件读写操作,导致虚拟文件系统(VFS)缓存命中率下降,影响整体性能时,可能需要调整vm.vfs_cache_pressure参数以优化磁盘I/O性能。", "boottlneck_type": "system"} ++{"param_name": "vm.min_free_kbytes", "content": "当系统的内存使用率持续高位,导致系统性能下降,频繁触发swap交换,或者应用程序运行时出现内存不足错误(OOM Killer),此时可能需要调整vm.min_free_kbytes参数以保证足够的内存空间供内核预留,以防止因为最小空闲内存阈值过低引发的性能问题。", "boottlneck_type": "system"} ++{"param_name": "vm.watermark_scale_factor", "content": "当系统负载持续高位运行,CPU利用率超过阈值,且内存频繁交换导致性能下降时,可能需要调整vm.watermark_scale_factor参数以优化内存管理和调度。", "boottlneck_type": "system"} ++{"param_name": "vm.max_map_count", "content": 
"当系统面临大量内存映射文件、动态库加载或大数据处理,导致内存映射表满且影响性能时,需要调整vm.max_map_count参数以提高系统的内存映射能力。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.ip_local_port_range", "content": "当系统面临高并发连接压力,导致本地端口资源紧张,引发性能瓶颈或连接拒绝时,可能需要调整net.ipv4.ip_local_port_range参数以扩大可用的本地端口号范围,从而提高并发处理能力。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_thin_linear_timeouts", "content": "当系统中存在大量短连接且频繁建立与释放,导致TCP连接管理效率下降、服务器资源浪费或者性能瓶颈时,可能需要调整net.ipv4.tcp_thin_linear_timeouts参数。", "boottlneck_type": "system"} ++{"param_name": "net.core.somaxconn", "content": "当系统处理大量并发连接请求时,如果发现队列中的连接数持续增长,导致服务器响应变慢或者出现拒绝服务(Denial of Service, DoS)风险,就需要考虑调整net.core.somaxconn参数,以限制同时接受的最大连接数,以优化系统资源管理和防止资源耗尽。", "boottlneck_type": "system"} ++{"param_name": "net.core.rps_sock_flow_entries", "content": "当系统处理高并发连接,网络I/O瓶颈明显,且需要优化socket流控制以提高处理速度时,可能需要调整net.core.rps_sock_flow_entries参数。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_max_tw_buckets", "content": "当系统中存在大量TIME_WAIT状态的TCP连接,导致网络拥塞或者资源浪费时,可能需要调整net.ipv4.tcp_max_tw_buckets参数以优化套接字超时和重用策略。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_fin_timeout", "content": "当系统中存在大量TCP连接且频繁出现FIN(结束)包但超时未被正常关闭的情况,或者应用程序响应延迟,可能导致资源浪费时,可能需要调整net.ipv4.tcp_fin_timeout参数。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_sack", "content": "当系统中存在频繁的数据包丢失或者网络拥塞,导致应用性能下降,且使用TCP协议进行通信时,可能需要调整net.ipv4.tcp_sack参数以优化数据传输的可靠性和效率。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_tw_reuse", "content": "当系统中存在大量快速连接-断开(TIME_WAIT)状态的TCP连接,导致端口复用效率降低,影响服务性能或资源利用率时,可能需要调整net.ipv4.tcp_tw_reuse参数。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_max_syn_backlog", "content": "当系统面临高并发连接请求,导致TCP SYN队列堆积,引起网络拥塞或连接拒绝时,需要调整net.ipv4.tcp_max_syn_backlog参数以提高系统处理新连接的能力。", "boottlneck_type": "system"} ++{"param_name": "net.ipv4.tcp_synack_retries", "content": "当系统面临高并发连接请求,且出现频繁的SYN-ACK重传失败,导致连接建立速度变慢或者连接数受限时,可能需要调整net.ipv4.tcp_synack_retries参数以优化TCP连接处理性能。", "boottlneck_type": "system"} ++{"param_name": "vm.stat_interval", "content": "当系统负载高、频繁进行大量数据读写操作或者需要实时监控性能时,可能需要调整vm.stat_interval参数以提高系统资源统计的频率,以获得更及时的性能指标。", "boottlneck_type": "system"} ++{"param_name": "kernel.shmall", "content": "当系统内存资源紧张,进程间共享内存区域不足,导致系统性能下降或者出现段错误(segmentation fault)时,可能需要调整kernel.shmall参数以增大初始共享内存区大小。", "boottlneck_type": "system"} ++{"param_name": "kernel.shmmax", "content": "当系统内存资源紧张,进程间共享内存频繁且可能导致oom(out of memory)异常时,需要调整kernel.shmmax参数以限制共享内存的最大大小,以防止内存消耗过高影响整体系统稳定性。", "boottlneck_type": "system"} ++{"param_name": "kernel.shmmni", "content": "当系统处理大量并发线程或者内存密集型应用时,kernel.shmall和kernel.shmmni(共享内存的最大进程数)可能会达到上限,导致内存分配效率下降或出现错误,此时需要适当增加kernel.shmmni以优化内存管理,提高系统性能。", "boottlneck_type": "system"} ++{"param_name": "vm.overcommit_ratio", "content": "当系统资源紧张,内存被过度分配给其他进程导致性能下降,或者需要优化内存使用效率以提升虚拟内存的利用率时,可能需要调整vm.overcommit_ratio参数。", "boottlneck_type": "system"} ++{"param_name": "vm.zone_reclaim_mode", "content": "当系统资源紧张,内存分配频繁导致Zone(内存区域)不均衡,或者需要优化内存回收效率以提升整体性能时,可能需要调整vm.zone_reclaim_mode 参数。", "boottlneck_type": "system"} ++{"param_name": "vm.numa_stat", "content": "当系统中存在多核CPU架构并且中间件运行在非统一内存访问(NUMA)环境中,为了优化线程调度和内存访问效率,提高整体性能,可能需要调整vm.numa_stat参数。", "boottlneck_type": "system"} ++{"param_name": "vm.drop_caches", "content": "当系统内存压力大,频繁交换数据到磁盘导致性能下降时,可能需要调整vm.drop_caches参数来释放部分驻留内存量,通过主动清空部分页面缓存以缓解压力,提高系统响应速度。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_cfs_bandwidth_slice_us", "content": "kernel.sched_cfs_bandwidth_slice_us参数可能需要调整在以下情况:当系统负载较高,CPU分配给各个进程的时间片不均衡,导致某些服务响应延迟或者性能瓶颈出现时;或者在需要优化特定服务的CPU使用率,提升其运行效率,以保证整体系统资源利用率时。", "boottlneck_type": 
"system"} ++{"param_name": "kernel.sched_rt_runtime_us", "content": "当系统中的实时任务对响应时间有极高的要求,如嵌入式设备上的实时控制任务,或者在高并发场景下需要确保关键服务的低延迟,kernel.sched_rt_runtime_us 参数可能需要调整以优化实时任务的执行时间片,以保证其服务质量。", "boottlneck_type": "system"} ++{"param_name": "kernel.sched_autogroup_enabled", "content": "当系统负载较高,存在频繁的进程调度导致性能瓶颈,且发现与CPU亲和性相关的任务调度问题时,可能需要调整kernel.sched_autogroup_enabled参数以优化任务分组和资源分配。", "boottlneck_type": "system"} +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/optimize/strategy/system.jsonl b/copilot-tune/src/knowledge_base/optimize/strategy/system.jsonl +new file mode 100644 +index 0000000..d95cb4d +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/optimize/strategy/system.jsonl +@@ -0,0 +1,302 @@ ++[ ++ { ++ "策略名称": "CPU调频策略设置为 performance", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo performance | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor", ++ "优化前默认值": "performance", ++ "策略是否可直接配置": "Y", ++ "使用风险": "", ++ "功能说明": "在performance模式下,CPU 将运行在最高频率,不会进行频率的动态调整,以满足性能需求。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "优化 CPU 空闲状态下的性能", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo Y > /sys/module/cpuidle_haltpoll/parameters/force\n或启动参数配置 haltpoll.enable=Y cpuidle-haltpoll.force=Y", ++ "优化前默认值": "N", ++ "策略是否可直接配置": "Y", ++ "使用风险": "5.X内核在ipi机制上变更导致上下文切换延时大大增加,通过使能haltpoll弥补其带来的性能损耗,但会增加功耗", ++ "功能说明": "haltpoll 允许 CPU 在处于空闲状态时执行一种新的待命状态,这种空闲状态称为 \"polling\"。在此状态下,CPU 不仅可以进入低功耗模式,还可以通过执行简单的循环或检查操作来快速响应中断和事件。\n启用 haltpoll,可以降低 CPU 响应中断的延迟。这对于实时应用或对延迟敏感的工作负载(如网络处理或高性能计算)非常有益,因为不会因进入深度睡眠状态而增加唤醒的时间。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "tuned低时延配置", ++ "对应瓶颈点": "NETWORK", ++ "优化步骤": "tuned-adm profile latency-performance\ntuned-adm profile network-latency\ntuned-adm profile realtime\ntuned-adm profile cpu-partitioning", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "低时延配置通常会禁用节能特性,以确保系统性能。这可能导致系统能耗增加", ++ "功能说明": "Tuned 是 Linux 提供的性能调优工具,它提供了一系列预定义的配置模板,可以快速优化系统性能。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "haltpoll", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo Y > /sys/module/cpuidle_haltpoll/parameters/force\n 或启动参数配置\nhaltpoll.enable=Y \ncpuidle-haltpoll.force=Y", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "主动轮询会占用更多的 CPU 资源,可能导致宿主机的其他虚拟机或进程得不到足够的 CPU 时间片", ++ "功能说明": "haltpoll是虚拟化环境中的一种低延迟优化技术,其核心思想是通过​​主动轮询替代被动等待​​来减少虚拟CPU(vCPU)的唤醒延迟:", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "禁用控制自动组调度特性", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo 0 > /proc/sys/kernel/sched_autogroup_enabled", ++ "优化前默认值": 0, ++ "策略是否可直接配置": "Y", ++ "使用风险": "若设置为1,有可能对context1、pipe子项影响较大", ++ "功能说明": "控制自动组调度特性(Auto Group Scheduling)的启用与禁用。这个特性主要用于改进桌面(GUI)环境中进程的调度,以提供更好的响应性。该配置项一般在服务器场景默认关闭。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "降低内核vm数据统计频率", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "echo 10 > /proc/sys/vm/stat_interval", ++ "优化前默认值": 1, ++ "策略是否可直接配置": "Y", ++ "使用风险": "", ++ "功能说明": "设置虚拟内存系统统计信息更新的时间间隔。具体来说,它决定了内核收集关于虚拟内存和其他系统性能指标的统计数据的频率。这些数据通常用于监控和性能调优。\n设置更高周期的好处:\n1. 减少系统开销:通过将统计信息的更新间隔设为较长的时间,可以减少内核的上下文切换和资源消耗,从而让系统有更多的资源用于处理实际的业务负载。\n2. 平衡实时性与性能:较长的更新间隔可以减少干扰,这样系统负载可以更平稳,不会因频繁的统计更新而受到影响。\n3. 提高监控的有效性:对于一些长期运行的服务来说,统计信息的频繁更新可能会增加不必要的噪音,通过设置较长的间隔,可以更好地观察到系统的整体表现,而不是短期波动。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "程序进程绑核到同一个片上", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "1. 运行程序前设置绑核:taskset -c 0-7 ./test_program\n2. 
运行中进程设置绑核:taskset -pc 0-7 <pid>", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "仅针对个别子项有效,无法整体操作;需要依赖atune-runtime监控测试进程进行自动绑核", ++ "功能说明": "绑定进程到同一个 NUMA 节点内的核心,减少跨节点内存访问延迟,避免调度器迁移进程导致缓存失效。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "控制OpenMP程序中线程的 CPU 亲和性", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "export GOMP_CPU_AFFINITY=0-$(($(nproc) - 1));\nexport OMP_DYNAMIC=\"false\";\nexport OMP_THREAD_LIMIT=\"192\";\nexport OMP_SCHEDULE=\"static\"", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "", ++ "功能说明": "将线程均匀分配到所有可用的 CPU 核心(如 0~N-1,N 为系统总逻辑核心数),目的是减少线程迁移开销、提升缓存局部性,从而优化多核并行性能。适用于 GCC 的 OpenMP 实现(如 libgomp)。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "网卡中断绑核", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "systemctl stop irqbalance.service;\ncat /proc/interrupts | grep 网卡;\ncat /proc/irq/<中断号>/smp_affinity_list;\necho \"2-3\" > /proc/irq/<中断号>/smp_affinity_list;(2-3是绑定的cpu)", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "中断需要绑定到网卡所在NUMA节点的CPU核心上", ++ "功能说明": "网卡中断绑核是指将网卡的中断请求(IRQ)绑定到特定的CPU核心上,以优化系统的网络性能。在多核处理器系统中,这种优化可以减少CPU核心之间的上下文切换,提高中断处理效率,从而提升网络传输性能", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "用户态percpu缓存", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "1.绑定进程到特定 CPU 核心\ntaskset -c 0,1 ./application\n2.绑定内存到本地 NUMA 节点\nnumactl --cpunodebind=0 --membind=0 ./application\nnumactl --localalloc ./application(如果希望避免跨 NUMA 节点访问,可以使用 --localalloc)\n3.使用高性能内存分配器\nexport LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so或者\nexport LD_PRELOAD=/usr/lib/libtcmalloc.so\n./application\n4.禁用透明大页(THP),启用大页内存\necho never > /sys/kernel/mm/transparent_hugepage/enabled\necho never > /sys/kernel/mm/transparent_hugepage/defrag\necho 1024 > /proc/sys/vm/nr_hugepages", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "CPU缓存可能分配到非本地NUMA节点内存", ++ "功能说明": "per-CPU 缓存是高性能编程中常用的优化技术,通过在用户态为每个CPU核心维护独立的数据缓存,减少多线程竞争和缓存一致性协议(如MESI)带来的性能损耗。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "hbase关键进程绑核", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "taskset -pc 0,1 <pid>", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "跨NUMA节点访问内存导致性能下降", ++ "功能说明": "绑核优化(CPU Affinity)是提升HBase性能的重要手段,通过将关键进程绑定到特定CPU核心", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "用户态低时延调度", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "taskset -c 0,1 ./your_program\necho 1024 > /proc/sys/vm/nr_hugepages", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "优化可能导致系统资源(如CPU、内存)分配不均,引发不同任务或进程之间的资源竞争,从而影响整体性能", ++ "功能说明": "用户态低时延调度优化是一种针对高实时性需求场景的性能调优方法,旨在减少任务调度和执行的延迟。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "smt(调度时尽量使用本核,不用超线程的核)", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo KEEP_ON_CORE > /sys/kernel/debug/sched/features\necho 200 > /proc/sys/kernel/sched_util_ratio", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "多个线程同时访问共享资源可能导致资源竞争,降低系统性能", ++ "功能说明": "SMT通过让单个物理核心模拟多个逻辑核心来提高硬件利用率", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "guest_idle_poll特性 (hbase随机读)", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo Y > /sys/module/cpuidle_haltpoll/parameters/force\necho 400000 > /sys/module/haltpoll/parameters/guest_halt_poll_ns\n", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "如果轮询时间设置过大,可能会导致 CPU 使用率显著增加,尤其是在系统空闲时", ++ "功能说明": "guest_idle_poll特性是一种优化机制,用于减少虚拟机(Guest)中 vCPU 空闲时的上下文切换开销和任务唤醒延迟", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "clusteraware特性", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "1. 宏依赖:CONFIG_SCHED_CLUSTER\n2. 使能:echo 1 > /proc/sys/kernel/sched_cluster\n3. 
去使能:echo 0 > /proc/sys/kernel/sched_cluster", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "在集群环境中,多个节点可能会竞争有限的资源(如CPU、内存、网络带宽等),导致性能瓶颈", ++ "功能说明": "Cluster-Aware(集群感知)特性是指软件或系统能够自动识别集群环境,并在多节点之间协调资源管理、故障恢复和数据同步,以确保高可用性(HA)、负载均衡和一致性。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "网卡中断绑核", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "systemctl stop irqbalance.service\ncat /proc/interrupts | grep 网卡\ncat /proc/irq/<中断号>/smp_affinity_list\necho \"2-3\" > /proc/irq/<中断号>/smp_affinity_list(2-3是绑定的cpu)", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "中断需要绑定到网卡所在NUMA节点的CPU核心上", ++ "功能说明": "网卡中断绑核是指将网卡的中断请求(IRQ)绑定到特定的CPU核心上,以优化系统的网络性能。在多核处理器系统中,这种优化可以减少CPU核心之间的上下文切换,提高中断处理效率,从而提升网络传输性能", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "guest_idle_poll特性", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo Y > /sys/module/cpuidle_haltpoll/parameters/force\necho 400000 > /sys/module/haltpoll/parameters/guest_halt_poll_ns\n", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "如果轮询时间设置过大,可能会导致 CPU 使用率显著增加,尤其是在系统空闲时", ++ "功能说明": "guest_idle_poll特性是一种优化机制,用于减少虚拟机(Guest)中 vCPU 空闲时的上下文切换开销和任务唤醒延迟", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "arm:预取关闭,X86:预取开启", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "预取关闭:echo 0 > /sys/block/<dev>/queue/read_ahead_kb\n预取开启:echo value > /sys/block/<dev>/queue/read_ahead_kb(value是非零值)", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "预取的数据可能并不是程序实际需要的数据,导致缓存中存储了大量无用数据,从而将真正有用的数据挤出缓存,降低缓存命中率", ++ "功能说明": "预取是一种性能优化技术,用于提前将数据加载到内存中,以减少磁盘I/O延迟,从而提高数据访问速度。\nX86架构适合高性能计算,预取功能可以显著提升性能;而ARM架构注重低功耗和高效率,关闭预取功能可以更好地满足其设计目标", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "异步IO", ++ "对应瓶颈点": "DISK", ++ "优化步骤": "修改mysql配置文件:\n[mysqld]\ninnodb_use_native_aio = 1", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "异步写操作未完成时系统崩溃可能导致数据丢失", ++ "功能说明": "异步 I/O(Asynchronous I/O, AIO) 是一种重要的性能优化手段。它通过将 I/O 操作从主线程中解耦,减少阻塞和等待时间,从而提高数据库的整体吞吐量和响应速度。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "使用numad服务优化NUMA内存访问", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "1. 安装numad包\n2. numad服务开启:systemctl start numad\n3. 配置 numad:根据需要修改 /etc/numad.conf 配置文件,以调整其行为。可以设置 NUMA 节点、内存使用阈值和其它参数。\n4. 
绑定应用至 numad:在启动应用时,使用 numad 提供的 Mbind 特性,使应用程序能利用它的优化。可以通过以下命令行选项来实现:numad <pid>", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "", ++ "功能说明": "numad 是一个用于 NUMA(非一致性存储访问)架构下的内存分配的服务,旨在通过动态调整和优化内存使用来提高系统性能。\n1、自动内存分配:numad 会监控系统的内存使用情况,并自动将进程或线程的内存绑定到最合适的 NUMA 节点,最大限度地减少远程内存访问。\n2、负载均衡:它能够分析系统上的负载,并在多个 NUMA 节点之间分配资源,以实现更好的性能和负载均衡。\n3、内存迁移:如果某个线程的内存块不再有效(例如,远程访问带来的延迟),numad 可以选择迁移该线程或进程的内存,以优化性能。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "numa绑核", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "taskset -c 0,1 mysqld", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "如果绑核配置不当,可能导致进程访问远程内存,从而增加内存访问延迟", ++ "功能说明": "在NUMA架构中,系统被划分为多个节点(NUMA节点),每个节点包含一个或多个CPU核心以及本地内存。CPU访问本地内存的速度比访问其他节点的内存(远程内存)要快得多。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "任务调度", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "echo STEAL > /sys/kernel/debug/sched_features\necho NO_SIS_UTIL > /sys/kernel/debug/sched_features\necho SIS_PROP > /sys/kernel/debug/sched_features", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "频繁的任务迁移可能导致 CPU 缓存失效,降低性能。", ++ "功能说明": "通过任务调度动态调整内核调度器特性,优化任务分配和资源利用,提升系统性能。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "代码段、malloc大页调优(仅针对4K页内核)", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "# 禁用透明大页\necho never > /sys/kernel/mm/transparent_hugepage/enabled\necho never > /sys/kernel/mm/transparent_hugepage/defrag\n# 临时设置大页(重启后失效)\necho $huge_pages > /proc/sys/vm/nr_hugepages\n# 永久设置大页\necho \"vm.nr_hugepages = $huge_pages\" >> /etc/sysctl.conf\nsysctl -p", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "如果应用程序只需要少量内存,分配大页会导致内存浪费。", ++ "功能说明": "malloc 是标准 C 库提供的动态内存分配函数,默认情况下它使用操作系统的普通页(通常是 4KB)。在高性能场景下(例如 MySQL 数据库或其他需要频繁分配大块内存的应用),这种默认行为可能导致性能瓶颈。为了提升性能,可以通过配置和代码调整,使 malloc 使用 大页内存(Huge Pages) ,从而减少内存管理开销并提高访问效率。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "tcmalloc大页", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "# 预留大页(例如预留1024个2MB大页)\necho 1024 > /proc/sys/vm/nr_hugepages\n# 通过环境变量启用大页支持\nexport TCMALLOC_USE_HUGE_PAGES=1\n# 指定大页阈值(默认2MB,可调整为1GB页)\nexport TCMALLOC_HUGE_PAGE_THRESHOLD=2097152 # 2MB\nLD_PRELOAD=\"/usr/lib/libtcmalloc.so\" ./your_program", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "大页分配导致内存浪费,以及跨NUMA节点访问大页性能下降", ++ "功能说明": "TCMalloc(Thread-Caching Malloc)是一种高性能的内存分配器,它通过多级缓存结构优化内存分配性能。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "6.6内核以上\n动态复合页代码段/so大页等等", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "程序复合页:\necho always > /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/enabled\necho 0x{xxx} > /sys/kernel/mm/transparent_hugepage/thp_exec_enabled\necho 4 > /sys/kernel/mm/transparent_hugepage/pcp_allow_high_order\necho 1 > /sys/kernel/mm/transparent_hugepage/thp_mapping_align\n文件复合页:\nmount -t ext4 -o buffered_iomap /dev/sda1 /mnt\nmount -t ext4 -o nobuffered_iomap /dev/sda1 /mnt\nUUID=xxx / ext4 buffered_iomap 0 0\nUUID=xxx / ext4 nobuffered_iomap 0 0\nmount -o remount,nobuffered_iomap /mnt\nmount -o remount,buffered_iomap /mnt", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "大页未充分利用导致内存浪费", ++ "功能说明": "动态复合页代码段/so大页,允许将多个常规页(通常4KB)动态组合成更大的复合页(如2MB),而无需预先分配大页内存", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "透明大页关闭", ++ "对应瓶颈点": "MEMORY", ++ "优化步骤": "echo never > /sys/kernel/mm/transparent_hugepage/enabled\necho never > /sys/kernel/mm/transparent_hugepage/defrag", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "关闭透明大页后,系统不再尝试将小页面合并为大页面,这可能会导致内存碎片化增加,尤其是在长时间运行的应用程序中", ++ "功能说明": "透明大页是 Linux 内核的一项内存管理优化技术,旨在自动使用大页(HugePages)来提升内存访问效率,减少 TLB(Translation Lookaside Buffer)缺失,从而提高系统性能。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "设置堆栈空间上限和用户进程内存上限", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "ulimit -s unlimited\nulimit -l 2097152", ++ "优化前默认值": "1. 8192\n2. 
65536", ++ "策略是否可直接配置": "Y", ++ "使用风险": "621子项需要放开系统资源限制,并且子项使用openmp,需设置CPU亲和性,否则测试波动较大,设置CPU亲和性后测试结果较优", ++ "功能说明": "1. 对于一些需要大量堆栈空间的程序,例如复杂的递归算法或需要高内存使用的应用(如大数据处理或复杂计算),将堆栈大小设置为无限制,可以保证程序正常运行。\n2. 锁定内存的目的在于防止这些内存区域被交换到磁盘上。这对于实时系统或对性能要求极高的应用尤为重要,因为将数据交换到磁盘会涉及延迟,可能会影响程序的响应时间。\n3. 通过管理线程的亲和性,可以优化程序性能、提高资源利用率,减少上下文切换。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "卸载内核网络数据包过滤模块", ++ "对应瓶颈点": "CPU", ++ "优化步骤": "客户端卸载内核模块iptable_security,iptable_raw,iptable_nat,iptable_mangle", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "使用iptable命令查询相关表时,内核自动加载相关模块,额外引入热点ipt_do_table", ++ "功能说明": "内核模块 iptable_security、iptable_raw、iptable_nat 和 iptable_mangle 是 Linux 系统中用于网络流量控制和数据包处理的组成部分。\n1. iptable_security 模块为 Netfilter 提供更强的安全功能,主要用于访问控制策略的实施,可以根据包的安全上下文来决定包的处理方式。\n2. iptable_raw 模块允许不进行连接跟踪的原始数据包处理。通过使用这个模块,可以在数据包进入连接跟踪之前对它们进行过滤。\n3. iptable_nat 模块是用于实现网络地址转换(NAT)的,它能够在数据包经过时改变其源地址或目标地址。常用于实现路由器和防火墙功能。\n4. iptable_mangle 模块用于修改数据包的标头信息,包括 QoS(服务质量)标记、TTL(生存时间)等。它主要用于数据流量的修改和精细控制。\n这些内核模块可在网络流量控制、安全性和数据包处理方面提供增强功能,但也会引入一定的性能开销。\n去除后对UDP_STREAM可带来一定的性能提升。", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "网络参数", ++ "对应瓶颈点": "NETWORK", ++ "优化步骤": "sysctl -w net.ipv4.tcp_autocorking=0\nsysctl -w net.core.gro_normal_batch=1", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "禁用 TCP 自动填充可能会导致更多的小数据包发送,从而增加网络带宽的使用", ++ "功能说明": "TCP 自动填充(Autocorking)是一种优化机制,用于减少 TCP 数据包的发送次数,通过将多个小数据包合并为一个较大的数据包来提高网络效率。然而,在某些场景下,自动填充可能会导致不必要的延迟。将 tcp_autocorking 设置为 0 可以禁用这一特性,从而减少延迟,适用于对实时性要求较高的场景。\ngro_normal_batch 参数用于控制通用接收卸载(Generic Receive Offload, GRO)的批量处理数量。GRO 是一种网络优化技术,用于将多个小的数据包合并为一个较大的数据包,以减少处理开销。将 gro_normal_batch 设置为 1 可以减少批量处理的数量,从而降低延迟,提高网络响应速度\n", ++ "使能状态检查": "" ++ }, ++ { ++ "策略名称": "网络参数,Netfilter 相关表的优化\n", ++ "对应瓶颈点": "NETWORK", ++ "优化步骤": "echo 0 > /proc/sys/net/bridge/bridge-nf-call-arptables\necho 0 > /proc/sys/net/bridge/bridge-nf-call-iptables\necho 0 > /proc/sys/net/bridge/bridge-nf-call-ip6tables", ++ "优化前默认值": "", ++ "策略是否可直接配置": "Y", ++ "使用风险": "能导致某些安全策略(如防火墙规则)无法生效,从而增加安全风险", ++ "功能说明": "这些事是对 Netfilter 相关表的优化,具体来说是减少网桥(bridge)与 Netfilter 防火墙表(如 iptables、ip6tables 和 arptables)之间的交互。这种优化通常用于提高网络性能,尤其是在虚拟化和容器化环境中", ++ "使能状态检查": "" ++ } ++] +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/params/mysql_params.json b/copilot-tune/src/knowledge_base/params/mysql_params.json +new file mode 100644 +index 0000000..1381a53 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/params/mysql_params.json +@@ -0,0 +1,849 @@ ++[ ++ { ++ "name": "innodb_adaptive_flushing", ++ "info": { ++ "desc": "该参数指定是否根据工作负载动态调整InnoDB缓冲池中脏页的刷新速率。启用自适应刷新机制可以提高性能,并缓解磁盘I/O瓶颈。取值为ON时启用自适应刷新,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_adaptive_flushing_lwm", ++ "info": { ++ "desc": "定义了自适应刷新启用时的低水位线,表示重做日志容量的百分比。增大此值可以提高写入性能,但可能会增加恢复时间,能够缓解磁盘IO瓶颈。默认值为10。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 70 ++ ], ++ "dtype": "int", ++ "default_value": 10 ++ } ++ }, ++ { ++ "name": "innodb_adaptive_hash_index", ++ "info": { ++ "desc": "自适应哈希索引是一种机制,允许InnoDB根据查询模式动态创建哈希索引,以加速查询。它可以缓解CPU瓶颈,提升查询性能。该参数取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_adaptive_hash_index_parts", ++ "info": { ++ "desc": "InnoDB自适应哈希索引的分区数量。增大此值可以提高哈希索引的性能,但可能会增加内存使用。可以缓解内存瓶颈。默认值为8,最大值为512,最小值为1。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 512 ++ ], ++ "dtype": "int", ++ "default_value": 8 ++ } ++ }, ++ { ++ 
"name": "innodb_adaptive_max_sleep_delay", ++ "info": { ++ "desc": "该参数允许InnoDB根据当前工作负载自动调整innodb_thread_sleep_delay的值。它定义了InnoDB在自适应最大睡眠延迟期间的最大延迟时间(以微秒为单位),可以帮助缓解CPU瓶颈,尤其是在高并发情况下。增大该值可以使InnoDB在等待锁时更长时间地休眠,从而减少CPU的使用率,减小该值则会使InnoDB更快地尝试获取锁。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1000000 ++ ], ++ "dtype": "int", ++ "default_value": 150000 ++ } ++ }, ++ { ++ "name": "innodb_api_bk_commit_interval", ++ "info": { ++ "desc": "该参数定义了InnoDB memcached接口中空闲连接的自动提交时间间隔(以秒为单位)。增大该值可以减少提交操作的频率,从而帮助缓解CPU瓶颈,减小该值则会增加提交操作的频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1073741824 ++ ], ++ "dtype": "int", ++ "default_value": 5 ++ } ++ }, ++ { ++ "name": "innodb_api_disable_rowlock", ++ "info": { ++ "desc": "该参数用于禁用InnoDB API在执行DML操作时的行锁定。设置为ON时禁用行锁定,设置为OFF时启用行锁定。禁用行锁定可能会导致性能下降,但在某些情况下可以帮助缓解CPU瓶颈。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_chunk_size", ++ "info": { ++ "desc": "该参数定义了InnoDB缓冲池的每个块的大小(以字节为单位)。增大该值可以减少块的数量,从而减少管理开销,减小该值则会增加块的数量,可能会提高并发性能。该参数对于缓解内存瓶颈非常重要。", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ "innodb_buffer_pool_size / innodb_buffer_pool_instances" ++ ], ++ "dtype": "int", ++ "default_value": 134217728 ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_debug", ++ "info": { ++ "desc": "启用此选项允许在缓冲池大小小于1GB时使用多个缓冲池实例,忽略对`innodb_buffer_pool_instances`施加的1GB最小缓冲池大小限制。该选项仅在使用`WITH_DEBUG` CMake选项编译时可用。该参数用于启用或禁用InnoDB缓冲池的调试信息,帮助开发人员和DBA分析内存使用情况,主要影响内存瓶颈。取值为ON时启用调试,OFF时禁用调试。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_dump_at_shutdown", ++ "info": { ++ "desc": "指示是否在关闭时转储缓冲池的内容。启用此选项可以提高下次启动时的性能。取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "ON", ++ "OFF" ++ ], ++ "dtype": "string", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_dump_now", ++ "info": { ++ "desc": "该参数用于立即转储InnoDB缓冲池的内容到磁盘。它通常与参数innodb_buffer_pool_load_now结合使用,能够帮助缓解磁盘IO瓶颈,尤其是在重启后需要快速恢复数据时。取值为ON时表示立即转储,OFF时表示不转储。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "Boolean", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_dump_pct", ++ "info": { ++ "desc": "该参数定义了在缓冲池转储时要转储的页面百分比。它指定了每个缓冲池中最近使用页面的百分比,以便在重启时更快地恢复数据。增大该值可以加快重启时的数据恢复速度,而减小该值则会减少转储的数据量,从而有助于缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "dtype": "Integer", ++ "default_value": 25 ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_filename", ++ "info": { ++ "desc": "指定缓冲池转储文件的名称,该文件保存由 `innodb_buffer_pool_dump_at_shutdown` 或 `innodb_buffer_pool_dump_now` 生成的表空间ID和页面ID列表。设置合适的文件名可以影响恢复性能,并有助于缓解磁盘IO瓶颈。建议使用默认值。", ++ "type": "discrete", ++ "range": [ ++ "ib_buffer_pool" ++ ], ++ "dtype": "string", ++ "default_value": "ib_buffer_pool" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_instances", ++ "info": { ++ "desc": "设置缓冲池实例的数量,允许多个缓冲池实例并行处理请求,以提高性能。可以缓解内存瓶颈。建议在缓冲池大小大于1GB时使用多个实例。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "Integer", ++ "default_value": 8 ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_in_core_file", ++ "info": { ++ "desc": "该参数指示是否将InnoDB缓冲池的内容写入核心转储文件。禁用该参数可以减少核心文件的大小,适用于故障排除。取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "Boolean", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_load_at_startup", ++ "info": { ++ "desc": "指示在MySQL服务器启动时,InnoDB缓冲池是否自动加载之前持有的页面,以提高启动性能。取值为ON时启用,OFF时禁用。", ++ "type": 
"discrete", ++ "range": [ ++ "ON", ++ "OFF" ++ ], ++ "dtype": "string", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_load_now", ++ "info": { ++ "desc": "该参数用于立即加载InnoDB缓冲池的内容,帮助缓解磁盘IO瓶颈,尤其在重启后需要快速恢复数据时。取值为ON时表示立即加载,OFF时表示不加载。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_buffer_pool_size", ++ "info": { ++ "desc": "设置InnoDB缓冲池的大小,缓冲池用于缓存数据和索引,以提高性能。增大该值可以减少磁盘I/O,提高读取性能,建议设置为系统内存的70%-80%。这是InnoDB最重要的参数之一,直接影响数据库的性能和响应速度。", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ { ++ "64-bit": "18446744073709551615 (2^64-1)", ++ "32-bit": "4294967295 (2^32-1)" ++ } ++ ], ++ "dtype": "int", ++ "default_value": 134217728 ++ } ++ }, ++ { ++ "name": "innodb_change_buffering", ++ "info": { ++ "desc": "该参数控制InnoDB如何缓冲对辅助索引的更改,优化写入操作,能够缓解磁盘I/O瓶颈。可选值包括:none(禁用)、inserts(仅插入)、deletes(仅删除)、changes(更改)、purges(清除)和all(全部)。", ++ "type": "discrete", ++ "range": [ ++ "none", ++ "inserts", ++ "deletes", ++ "changes", ++ "purges", ++ "all" ++ ], ++ "dtype": "string", ++ "default_value": "all" ++ } ++ }, ++ { ++ "name": "innodb_change_buffer_max_size", ++ "info": { ++ "desc": "该参数定义了InnoDB更改缓冲区的最大大小(以百分比表示),作为缓冲池总大小的一个百分比。增大该值可以提高写入性能,帮助缓解磁盘IO瓶颈,而减小该值则会限制更改缓冲区的大小,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 50 ++ ], ++ "dtype": "int", ++ "default_value": 25 ++ } ++ }, ++ { ++ "name": "innodb_checkpoint_disabled", ++ "info": { ++ "desc": "这是一个调试选项,用于禁用检查点机制,以强制在服务器退出时进行恢复。禁用检查点可能会导致性能下降,但在某些情况下可以提高写入性能,并缓解磁盘IO瓶颈。取值为ON时禁用,OFF时启用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "string", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_checksum_algorithm", ++ "info": { ++ "desc": "指定如何生成和验证存储在InnoDB表空间磁盘块中的校验和,以确保数据完整性。可以缓解CPU瓶颈。可选值包括crc32、strict_crc32、innodb、strict_innodb、none和strict_none,建议使用crc32以获得较好的性能和安全性。", ++ "type": "discrete", ++ "range": [ ++ "crc32", ++ "strict_crc32", ++ "innodb", ++ "strict_innodb", ++ "none", ++ "strict_none" ++ ], ++ "dtype": "string", ++ "default_value": "crc32" ++ } ++ }, ++ { ++ "name": "innodb_commit_concurrency", ++ "info": { ++ "desc": "该参数控制同时提交的事务数量,允许多个线程并发提交以提高性能。适当调整此参数可以缓解CPU瓶颈,建议根据系统负载进行调整,通常设置为8或16。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1000 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ } ++ }, ++ { ++ "name": "innodb_compression_level", ++ "info": { ++ "desc": "指定用于InnoDB压缩表和索引的zlib压缩级别。该参数控制数据压缩的强度,可以缓解磁盘IO瓶颈。取值范围为0(无压缩)到9(最大压缩),增大该值会增加CPU使用率,但可以减少磁盘空间的使用。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 9 ++ ], ++ "dtype": "int", ++ "default_value": 6 ++ } ++ }, ++ { ++ "name": "innodb_compression_pad_pct_max", ++ "info": { ++ "desc": "该参数定义了InnoDB压缩填充的最大百分比。它指定每个压缩页面中可以保留的最大空闲空间百分比。增大该值可以提高压缩效率,减小该值则可能会导致压缩效率下降,从而帮助缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 75 ++ ], ++ "dtype": "int", ++ "default_value": 50 ++ } ++ }, ++ { ++ "name": "innodb_concurrency_tickets", ++ "info": { ++ "desc": "该参数定义了InnoDB的并发票据数,决定了可以同时进入InnoDB的线程数量。增大该值可以提高并发性能,帮助缓解CPU瓶颈,而减小该值则可能导致性能下降。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 5000 ++ } ++ }, ++ { ++ "name": "innodb_ddl_threads", ++ "info": { ++ "desc": "该参数定义了InnoDB在执行DDL操作时可以使用的最大并行线程数。增大该值可以提高DDL操作的并发性,帮助缓解CPU瓶颈,而减小该值则可能导致性能下降。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ } ++ }, ++ { ++ "name": "innodb_default_row_format", ++ "info": { ++ "desc": 
"该参数定义了InnoDB表和用户创建的临时表的默认行格式。选择合适的行格式可以优化存储和性能,帮助缓解磁盘IO瓶颈。可选的行格式包括DYNAMIC、COMPRESSED、REDUNDANT和COMPACT。", ++ "type": "discrete", ++ "range": [ ++ "REDUNDANT", ++ "COMPACT", ++ "DYNAMIC", ++ "COMPRESSED" ++ ], ++ "dtype": "enumeration", ++ "default_value": "DYNAMIC" ++ } ++ }, ++ { ++ "name": "innodb_disable_sort_file_cache", ++ "info": { ++ "desc": "该参数用于禁用操作系统文件系统缓存,以便在进行合并排序的临时文件时提高性能。启用该参数(值为ON)可以帮助缓解磁盘IO瓶颈,而禁用缓存(值为OFF)可能会导致性能下降。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_doublewrite", ++ "info": { ++ "desc": "控制双写缓冲机制,以提高数据安全性。可以缓解磁盘IO瓶颈。MySQL 8.0.30及以上版本支持多种设置,取值为ON时启用,OFF时禁用,DETECT_AND_RECOVER和DETECT_ONLY用于检测和恢复。建议保持启用以防止数据丢失。", ++ "type": "discrete", ++ "range": [ ++ "ON", ++ "OFF", ++ "DETECT_AND_RECOVER", ++ "DETECT_ONLY" ++ ], ++ "dtype": "enumeration", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_fast_shutdown", ++ "info": { ++ "desc": "控制InnoDB的快速关闭行为,影响关闭时的性能和数据安全性。可以缓解磁盘IO瓶颈。取值为0(完全关闭)、1(快速关闭)、2(最小关闭),建议使用1以平衡性能和安全性。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 1, ++ 2 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ } ++ }, ++ { ++ "name": "innodb_file_per_table", ++ "info": { ++ "desc": "启用每个表使用独立的表空间文件,以提高管理灵活性和性能,能够缓解磁盘IO瓶颈。当取值为ON时启用,OFF时禁用。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_flushing_avg_loops", ++ "info": { ++ "desc": "该参数定义了InnoDB在平均循环中进行刷新操作的次数。它控制InnoDB保持之前计算的刷新状态快照的迭代次数,从而影响自适应刷新对变化工作负载的响应速度。增大该值可以减少刷新操作的频率,可能有助于缓解磁盘IO瓶颈,而减小该值则会增加刷新操作的频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1000 ++ ], ++ "dtype": "int", ++ "default_value": 30 ++ } ++ }, ++ { ++ "name": "innodb_flush_log_at_trx_commit", ++ "info": { ++ "desc": "控制事务提交时日志的刷新策略,影响数据安全性和性能。取值为0(每秒刷新一次)、1(每次提交刷新)、2(每次提交写入内存)。建议根据数据安全性需求进行选择,适当调整可以缓解磁盘IO瓶颈。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 1, ++ 2 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ } ++ }, ++ { ++ "name": "innodb_flush_method", ++ "info": { ++ "desc": "定义用于将数据刷新到InnoDB数据文件和日志文件的方法,这会影响I/O吞吐量。可以缓解磁盘I/O瓶颈。可选值包括fsync、O_DSYNC、littlesync、nosync、O_DIRECT、O_DIRECT_NO_FSYNC、unbuffered和normal。建议根据存储设备选择合适的刷新方法。", ++ "type": "discrete", ++ "range": [ ++ "fsync", ++ "O_DSYNC", ++ "littlesync", ++ "nosync", ++ "O_DIRECT", ++ "O_DIRECT_NO_FSYNC", ++ "unbuffered", ++ "normal" ++ ], ++ "dtype": "String", ++ "default_value": [ ++ "fsync", ++ "unbuffered" ++ ] ++ } ++ }, ++ { ++ "name": "innodb_force_recovery", ++ "info": { ++ "desc": "该参数用于设置InnoDB的强制恢复模式,通常在严重故障排除情况下更改。它可以帮助在崩溃后恢复数据,主要影响磁盘IO瓶颈。取值范围为0到6,其中0表示正常模式,1到6表示不同级别的恢复模式,数值越大,恢复的限制越多。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 6 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ } ++ }, ++ { ++ "name": "innodb_ft_max_token_size", ++ "info": { ++ "desc": "该参数定义了InnoDB全文索引中最大令牌大小。它决定了可以被索引的单词的最大字符长度。增大该值可以允许更长的单词被索引,从而可能帮助缓解CPU瓶颈,减小该值则会限制索引的单词长度。", ++ "type": "continuous", ++ "range": [ ++ 10, ++ 84 ++ ], ++ "dtype": "int", ++ "default_value": 84 ++ } ++ }, ++ { ++ "name": "innodb_idle_flush_pct", ++ "info": { ++ "desc": "该参数限制了InnoDB在空闲时的页面刷新百分比。增大该值可以提高空闲时的刷新频率,从而帮助缓解磁盘IO瓶颈;减小该值则会减少空闲时的刷新频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "default_value": 100 ++ } ++ }, ++ { ++ "name": "innodb_io_capacity", ++ "info": { ++ "desc": "innodb_io_capacity变量定义了InnoDB后台任务可用的每秒I/O操作次数(IOPS)。该参数影响缓冲区刷新脏页的速率,建议根据存储设备的性能进行调整,通常设置为SAS 200,SSD 5000,PCI-E 10000-50000。", ++ "type": "continuous", ++ "range": [ ++ 100, 
++ 18446744073709551615 ++ ], ++ "dtype": "int", ++ "default_value": 200 ++ } ++ }, ++ { ++ "name": "innodb_io_capacity_max", ++ "info": { ++ "desc": "innodb_io_capacity_max变量定义了InnoDB后台任务执行的最大I/O操作次数(IOPS)。该参数设置InnoDB的最大I/O容量,影响高负载情况下的I/O速率,可以缓解磁盘I/O瓶颈。建议根据存储设备的性能进行调整,通常设置为1000-20000。", ++ "type": "continuous", ++ "range": [ ++ 100, ++ "18446744073709551615" ++ ], ++ "dtype": "int", ++ "default_value": "2 * innodb_io_capacity, min of 2000" ++ } ++ }, ++ { ++ "name": "innodb_lock_wait_timeout", ++ "info": { ++ "desc": "InnoDB事务在放弃之前等待行锁的时间(以秒为单位)。默认值为50秒。该参数影响事务的锁等待行为,可以缓解CPU瓶颈,建议根据应用需求进行调整。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1073741824 ++ ], ++ "dtype": "int", ++ "default_value": 50 ++ } ++ }, ++ { ++ "name": "innodb_log_buffer_size", ++ "info": { ++ "desc": "InnoDB日志缓冲区的大小,单位为字节。增大此值可以提高写入性能,尤其是在高事务负载的情况下,能够缓解磁盘IO瓶颈。默认值为16MB,最小值为1KB,最大值为4GB。", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 16777216 ++ } ++ }, ++ { ++ "name": "innodb_log_files_in_group", ++ "info": { ++ "desc": "InnoDB日志文件组中的日志文件数量。增加此值可以提高并发写入性能,并缓解磁盘IO瓶颈。默认值为2,最小值为2,最大值为100。", ++ "type": "discrete", ++ "range": [ ++ 2, ++ 100 ++ ], ++ "dtype": "int", ++ "default_value": 2 ++ } ++ }, ++ { ++ "name": "innodb_log_file_size", ++ "info": { ++ "desc": "每个InnoDB日志文件的大小。增大此值可以提高恢复性能,尤其是在大事务的情况下。可以缓解磁盘IO瓶颈。默认值为48MB,最大值为512GB,最小值为1MB。", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ "512GB / innodb_log_files_in_group" ++ ], ++ "dtype": "int", ++ "default_value": 50331648 ++ } ++ }, ++ { ++ "name": "innodb_max_dirty_pages_pct", ++ "info": { ++ "desc": "InnoDB允许的最大脏页百分比。该参数控制InnoDB在缓冲池中允许的脏页的比例,默认值为90%。增大此值可以减少写入操作,但可能会增加恢复时间。建议在内存分配过大导致Swap占用严重时适当减小此值,以释放Swap空间。过大的值会导致每次更新需要交换的数据页过多,而过小的值则可能导致更新操作变慢。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int", ++ "default_value": 90 ++ } ++ }, ++ { ++ "name": "innodb_max_dirty_pages_pct_lwm", ++ "info": { ++ "desc": "该参数定义了InnoDB允许的最小脏页百分比,表示在启用预刷写以控制脏页比例时的低水位线。增大该值可以减少写入操作的频率,减小该值则会增加写入操作的频率,可能会影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 99.999 ++ ], ++ "dtype": "Numeric", ++ "default_value": 10 ++ } ++ }, ++ { ++ "name": "innodb_max_purge_lag", ++ "info": { ++ "desc": "定义了期望的最大清除延迟。如果超过此值,将对INSERT、UPDATE和DELETE操作施加延迟。增大此值可以减少清除操作的频率,但可能会导致内存使用增加,从而缓解内存瓶颈。默认值为0,最大值为4294967295,最小值为0。", ++ "type": "discrete", ++ "range": [ ++ 0, ++ 4294967295 ++ ], ++ "dtype": "Integer", ++ "default_value": 0 ++ } ++ }, ++ { ++ "name": "innodb_numa_interleave", ++ "info": { ++ "desc": "启用NUMA内存交错。启用后可以提高多核系统的内存访问性能,缓解内存瓶颈。取值为ON/OFF,默认值为OFF。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "Boolean", ++ "default_value": "OFF" ++ } ++ }, ++ { ++ "name": "innodb_old_blocks_pct", ++ "info": { ++ "desc": "指定InnoDB缓冲池中用于旧块子列表的近似百分比。增大此值可以增加旧块的使用,可能会提高查询性能,并缓解内存瓶颈。默认值为37,最大值为95,最小值为5。", ++ "type": "continuous", ++ "range": [ ++ 5, ++ 95 ++ ], ++ "dtype": "int", ++ "default_value": 37 ++ } ++ }, ++ { ++ "name": "innodb_old_blocks_time", ++ "info": { ++ "desc": "指定一个块在被插入到旧子列表后,首次访问后必须保持在该子列表中的时间(以毫秒为单位),在此时间到达后,该块可以被移动到新子列表。增大此值可以增加旧块的保留时间,可能会提高查询性能,并缓解内存瓶颈。默认值为1000毫秒。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 1000 ++ } ++ }, ++ { ++ "name": "innodb_open_files", ++ "info": { ++ "desc": "指定InnoDB可以同时打开的最大文件数。增大此值可以提高并发性能,尤其是在高负载情况下,并且可以缓解内存瓶颈。默认值为300。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 65535 ++ ], ++ "dtype": "int", ++ "default_value": 300 ++ } ++ }, ++ { ++ "name": "innodb_page_size", ++ 
"info": { ++ "desc": "指定InnoDB表空间的页面大小,影响存储效率和性能。可以缓解磁盘I/O瓶颈。可选值为4096、8192、16384、32768和65536字节。", ++ "type": "discrete", ++ "range": [ ++ 4096, ++ 8192, ++ 16384, ++ 32768, ++ 65536 ++ ], ++ "dtype": "int", ++ "default_value": 16384 ++ } ++ }, ++ { ++ "name": "innodb_parallel_read_threads", ++ "info": { ++ "desc": "定义可以用于并行聚簇索引读取的线程数量。增大此值可以提高读取性能,尤其是在多核系统上,能够缓解CPU瓶颈。默认值为4,最小值为1,最大值为256。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 256 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ } ++ }, ++ { ++ "name": "innodb_read_ahead_threshold", ++ "info": { ++ "desc": "控制InnoDB使用的线性预读的敏感性,以便将页面预取到缓冲池中。增大此值可以提高顺序读取性能,但可能会增加随机读取的延迟,从而缓解磁盘IO瓶颈。默认值为56,最大值为64,最小值为0。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 56 ++ } ++ }, ++ { ++ "name": "innodb_read_io_threads", ++ "info": { ++ "desc": "InnoDB读取操作的I/O线程数量。增大此值可以提高读取性能,尤其是在高并发情况下,能够缓解磁盘I/O瓶颈。默认值为4,最大值为64,最小值为1。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ } ++ }, ++ { ++ "name": "innodb_sync_array_size", ++ "info": { ++ "desc": "定义了互斥锁/锁等待数组的大小。增大此值可以提高高并发工作负载下的线程协调能力,尤其是在等待线程数量较多的情况下。此设置必须在MySQL实例启动时配置,之后无法更改。增大此值可以提高写入性能,尤其是在高负载情况下,能够缓解磁盘IO瓶颈。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 1024 ++ ], ++ "dtype": "int", ++ "default_value": 1 ++ } ++ }, ++ { ++ "name": "innodb_table_locks", ++ "info": { ++ "desc": "如果 `autocommit = 0`,`InnoDB` 会遵循 `LOCK TABLES`;MySQL 不会在 `LOCK TABLES ... WRITE` 之前返回,直到所有其他线程释放了对该表的所有锁。启用或禁用InnoDB表锁。启用后可以提高并发性能,缓解CPU瓶颈。取值为ON/OFF,默认值为ON。", ++ "type": "discrete", ++ "range": [ ++ "OFF", ++ "ON" ++ ], ++ "dtype": "boolean", ++ "default_value": "ON" ++ } ++ }, ++ { ++ "name": "innodb_thread_concurrency", ++ "info": { ++ "desc": "该参数定义了允许进入InnoDB的最大线程数。值为0(默认值)表示无限并发(无上限)。在高并发系统中,此变量用于性能调优。推荐设置为服务器CPU核心数的2倍,以优化并发操作。适当的设置可以提高CPU多核处理能力和并发量,但过高的值可能导致上下文切换增加,从而影响性能。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 1000 ++ ], ++ "dtype": "int", ++ "default_value": 0 ++ } ++ }, ++ { ++ "name": "innodb_thread_sleep_delay", ++ "info": { ++ "desc": "InnoDB线程休眠延迟,表示InnoDB线程在加入InnoDB队列之前的休眠时间,单位为微秒。增大此值可以减少CPU使用率,但可能会影响响应时间,适用于缓解CPU瓶颈。默认值为10000微秒,最大值为4294967295微秒,最小值为0微秒。", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 4294967295 ++ ], ++ "dtype": "int", ++ "default_value": 10000 ++ } ++ }, ++ { ++ "name": "innodb_write_io_threads", ++ "info": { ++ "desc": "InnoDB写入操作的I/O线程数量。增大此值可以提高写入性能,尤其是在高并发情况下,能够缓解磁盘I/O瓶颈。默认值为4,最大值为64,最小值为1。", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 64 ++ ], ++ "dtype": "int", ++ "default_value": 4 ++ } ++ }, ++ { ++ "name": "max_heap_table_size", ++ "info": { ++ "desc": "This variable sets the maximum size to which user-created MEMORY tables are permitted to grow. It can reduce disk I/O by keeping temporary tables in memory.", ++ "type": "continuous", ++ "range": [ ++ 16384, ++ 18446744073709551615 ++ ], ++ "dtype": "int", ++ "default_value": 16777216 ++ } ++ }, ++ { ++ "name": "thread_cache_size", ++ "info": { ++ "desc": "The number of threads cached to handle new connections. This variable can be increased to improve performance if you have a lot of new connections. It reduces CPU overhead from thread creation/destruction. 
Ideally set high enough so that most new connections use cached threads, but higher values consume more memory.", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 16384 ++ ], ++ "dtype": "int", ++ "default_value": 9 ++ } ++ }, ++ { ++ "name": "tmp_table_size", ++ "info": { ++ "desc": "Defines the maximum size of internal in-memory temporary tables created by the MEMORY storage engine and, as of MySQL 8.0.28, the TempTable storage engine. If the size of a temporary table exceeds this value, it is converted to a disk-based table, which can alleviate memory and disk I/O. Higher values reduce disk usage but increase memory consumption.", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 18446744073709551615 ++ ], ++ "dtype": "int", ++ "default_value": 16777216 ++ } ++ } ++] +\ No newline at end of file +diff --git a/copilot-tune/src/knowledge_base/params/system_params.json b/copilot-tune/src/knowledge_base/params/system_params.json +new file mode 100644 +index 0000000..0cd10e9 +--- /dev/null ++++ b/copilot-tune/src/knowledge_base/params/system_params.json +@@ -0,0 +1,1935 @@ ++[ ++ { ++ "name": "kernel.sched_migration_cost_ns", ++ "info": { ++ "desc": "1. 在高负载多核系统中,若观察到频繁的进程迁移导致性能下降,可适当增加该值以减少不必要的迁移,典型调整范围为100000-500000纳秒\n\n2. 在低延迟敏感型应用中,若发现缓存局部性不佳导致性能下降,可适当降低该值以促进热进程迁移,典型调整范围为50000-200000纳秒", ++ "type": "continuous", ++ "range": [ ++ 100000, ++ 5000000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sched_cfs_bandwidth_slice_us", ++ "info": { ++ "desc": "1. 当需要更精细控制CFS带宽分配时(如高负载容器环境),可适当减小该值(默认5000微秒),但需注意过小会增加调度开销\n2. 在CPU资源充足且需要减少调度开销的场景下,可增大该值以减少全局时间池的分配频率", ++ "type": "continuous", ++ "range": [ ++ 1000, ++ 50000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sched_wakeup_granularity_ns", ++ "info": { ++ "desc": "1. 对于延迟敏感型应用(如实时任务或高频交易系统),若观测到任务切换过于频繁导致性能下降,可适当增大该值(如5000000-10000000 ns),减少不必要的抢占 \n\n2. 在CPU密集型负载场景下,若调度器日志显示进程频繁被短时间抢占(如<1000000 ns),且存在吞吐量下降现象,可尝试降低该值(如1000000-2000000 ns)以提高响应速度", ++ "type": "continuous", ++ "range": [ ++ 1000000, ++ 100000000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sched_latency_ns", ++ "info": { ++ "desc": "1. 当系统运行高优先级实时任务时,若出现调度延迟过高的情况,可适当减小该值以提高调度响应速度\n\n2. 对于CPU密集型负载且任务数量较多(超过8个逻辑CPU)的系统,应增大该值以减少上下文切换开销", ++ "type": "continuous", ++ "range": [ ++ 1000000, ++ 100000000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sched_nr_migrate", ++ "info": { ++ "desc": "1. 当系统出现频繁的进程迁移导致性能下降时,可适当降低该值以减少迁移开销,建议从默认值32调整为16-24范围测试\n2. 对于NUMA架构服务器且进程具有较强CPU亲和性需求时,可适当提高该值至64以增强负载均衡能力", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 128 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sched_min_granularity_ns", ++ "info": { ++ "desc": "1. 当系统负载较高且存在大量短时间运行的进程时,可以适当增大该值以减少上下文切换开销,建议从默认值(通常为1,000,000 ns)逐步增加测试,观察性能变化\n\n2. 对于CPU密集型工作负载且进程运行时间普遍较长的情况,可以适当减小该值以提高系统响应能力,建议从默认值逐步减少测试", ++ "type": "continuous", ++ "range": [ ++ 1000000, ++ 100000000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sched_tunable_scaling", ++ "info": { ++ "desc": "1. 当系统CPU核心数较多(如32核以上)且存在调度延迟敏感型负载时,建议设置为2(线性比例调整),以优化多核环境下的调度粒度平衡\n\n2. 在低核心数服务器(如8核以下)或需要固定调度参数的实时任务场景中,建议设置为0(不调整),避免动态调整引入不可预测的延迟", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "transparent_hugepage.defrag", ++ "info": { ++ "desc": "1. 对于需要低延迟的应用(如数据库、实时系统),建议禁用该参数(设置为never),以避免因内存碎片整理导致的性能波动\n2. 
对于内存密集型且对延迟不敏感的应用(如批处理作业),建议启用该参数(设置为always或defer+madvise),以提高大内存页使用率减少TLB缺失", ++ "range": [ ++ "always", ++ "defer", ++ "defer+madvise", ++ "madvise", ++ "never" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "transparent_hugepage.enabled", ++ "info": { ++ "desc": "1. 对于延迟敏感型应用(如数据库、实时系统),建议禁用(设置为never或madvise),以避免因透明大页碎片整理导致的不可预测延迟\n\n2. 对于内存密集型批处理作业(如科学计算、大数据处理),建议启用(设置为always),以通过减少页表项提升内存访问效率", ++ "range": [ ++ "always", ++ "madvise", ++ "never" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.netfilter.nf_conntrack_max", ++ "info": { ++ "desc": "1. 当服务器处理大量并发连接(如超过默认值65536)时,若出现\"table full\"相关内核日志或连接跟踪表频繁满导致丢包,应增加该值(通常设置为总内存MB数/16384,如8GB内存可设为524288)\n\n2. 在高并发短连接场景下,若nf_conntrack_count经常接近nf_conntrack_max值,应结合连接跟踪超时时间(nf_conntrack_tcp_timeout_*系列参数)一同调整,避免过早占满跟踪表", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 67108864 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.mtu", ++ "info": { ++ "desc": "1. 当网络传输中出现大量分片或性能下降时,建议测试并调整MTU值以匹配实际网络环境(通常1500是以太网默认值,但需考虑VPN/隧道开销) \n\n2. 在高速网络(如10Gbps+)或特定拓扑(如RDMA)中,可尝试增大MTU(如9000)以提升吞吐量,但需确保全网设备支持Jumbo Frame", ++ "type": "continuous", ++ "range": [ ++ 500, ++ 9000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.tx-frames", ++ "info": { ++ "desc": "在高吞吐量网络环境下(如10Gbps以上),若观察到tx_dropped或tx_errors计数器持续增长,可适当增加该值至64-256范围以提升批量发包效率 \n\n当CPU软中断(如NET_TX)占用率过高且网络延迟敏感时,应降低该值至32以下以减少单次中断处理的数据包数量", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 64 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.rx-frames", ++ "info": { ++ "desc": "1. 在高吞吐量网络环境中,当观察到`/proc/interrupts`显示单个CPU核心处理大量网络中断导致软中断(si)占用过高时,应适当增加该值以减少中断频率,典型调整范围为64-256 \n\n2. 在低延迟应用场景中,若网络延迟敏感型应用(如金融交易系统)出现延迟抖动,且`netstat -s`显示报文处理延迟增大,可尝试降低该值至32以下以加快中断响应", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 64 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.tx-usecs", ++ "info": { ++ "desc": "1. 当网络传输延迟敏感型应用(如高频交易、实时视频流)出现延迟问题时,可尝试减小该值以减少中断延迟,建议从默认值100us逐步下调测试(如50us、25us),需配合性能监控避免过度降低导致CPU占用上升 \n2. 在CPU利用率过高且网络吞吐量未达瓶颈时,可适当增大该值(如200us)以减少中断频率,但需确保不会引入明显的延迟增加", ++ "type": "continuous", ++ "range": [ ++ 2, ++ 64 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.rx-usecs", ++ "info": { ++ "desc": "在高吞吐量网络环境下(如10Gbps以上),如果/proc/interrupts显示单个CPU核心处理过多RX中断,建议增加该值(如50-100微秒)以减少中断频率 \n\n当系统CPU利用率过高且/proc/net/softnet_stat显示drop计数增长时,可适当降低该值(如20-30微秒)以加快中断响应", ++ "type": "continuous", ++ "range": [ ++ 2, ++ 64 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.combined", ++ "info": { ++ "desc": "1. 当服务器处理高网络吞吐量(如10Gbps以上)且CPU出现软中断(softirq)不均时,应增加队列数量至与物理CPU核心数匹配,通常每个NUMA节点分配4-8个队列 \n\n2. 在虚拟化环境中当出现网络延迟波动时,需检查队列长度是否过短(默认256),建议结合网卡硬件能力调整为512-2048以缓冲突发流量", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 32 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.adaptive-rx", ++ "info": { ++ "desc": "1. 当网络接口接收高吞吐量小包时出现高CPU中断负载,应启用该参数以自动优化中断合并策略\n2. 在低延迟要求的场景(如高频交易)中若观察到网络延迟增加,应禁用该参数以避免自适应调整引入的延迟波动", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.adaptive-tx", ++ "info": { ++ "desc": "1. 在高吞吐量网络环境中(如10Gbps以上),当系统监控显示大量中断导致CPU使用率过高时,建议启用该参数以降低中断频率\n\n2. 在低延迟敏感型应用场景(如高频交易系统),若网络延迟指标出现异常波动,建议禁用该参数以确保中断响应及时性", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.tx-ring buffer size", ++ "info": { ++ "desc": "1. 当网络吞吐量高且出现数据包丢失或延迟增加时,可以适当增大该值以减少发送队列溢出风险,建议从默认值(通常256-1024)逐步倍增测试,最大值不超过65535\n\n2. 
在低吞吐量或低配置虚拟机环境中,若系统监控显示发送队列长期空闲(利用率<30%),可适当减小该值以释放内存资源,建议不低于64", ++ "type": "continuous", ++ "range": [ ++ 256, ++ 16384 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.rx-ring buffer size", ++ "info": { ++ "desc": "1. 当网络流量高峰期出现丢包或延迟增加时,可适当增大该值以提升吞吐量,但需注意内存消耗增加的风险\n2. 当系统内存资源紧张且网络负载较低时,可适当减小该值以释放内存资源,但需确保不会导致性能下降", ++ "type": "continuous", ++ "range": [ ++ 256, ++ 16384 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.generic-receive-offload", ++ "info": { ++ "desc": "1. 当服务器处理大量小包且CPU利用率过高时,建议禁用该参数以降低CPU开销\n2. 当服务器主要处理大流量数据且网络吞吐量不足时,建议启用该参数以减少协议栈处理开销", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.generic-segmentation-offload", ++ "info": { ++ "desc": "1. 当服务器主要处理大量小数据包(如DNS、VoIP等应用)且CPU利用率过高时,建议禁用该参数以降低CPU处理开销\n2. 当服务器主要传输大文件或视频流等大数据量应用时,建议启用该参数以提高网络吞吐量", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.tcp-segmentation-offload", ++ "info": { ++ "desc": "1. 在高吞吐量网络环境下(如10Gbps+),当CPU利用率因TCP分段处理过高时,建议启用该参数以减轻CPU负载\n2. 当遇到网络数据包校验错误或分片异常问题时,建议临时禁用该参数进行故障排查", ++ "range": [ ++ "on", ++ "off" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.pid_max", ++ "info": { ++ "desc": "- 当系统频繁达到当前pid_max限制导致无法创建新进程时,应适当增大该值,通常可设置为默认值(32768)的2-4倍\n- 在容器化环境中若需支持大量短生命周期进程,建议将pid_max提升至262144(2^18)以匹配现代Linux内核支持的上限", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ 4194304 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.shmmni", ++ "info": { ++ "desc": "- 当运行需要大量共享内存段的数据库(如Oracle)或科学计算应用时,若出现\"SHMMNI\"相关错误日志,应增加该值至超过应用实际需求的20%冗余量\n\n- 在容器化或虚拟化环境中,若单个物理节点需承载多个共享内存密集型实例,应按实例数乘以单个实例需求量的1.5倍进行设置", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 16384 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.shmmax", ++ "info": { ++ "desc": "1. 当运行需要大量共享内存的应用(如Oracle数据库、SAP HANA等)时,如果应用报错提示共享内存不足,需要将kernel.shmmax设置为至少等于所有共享内存段总和的80%-90%,但不超过物理内存的90%\n\n2. 在容器化或虚拟化环境中,当多个实例需要共享内存通信且出现性能瓶颈时,应根据每个实例的实际共享内存需求总和来调整kernel.shmmax,确保其值大于所有实例需求之和", ++ "type": "continuous", ++ "range": [ ++ 17179869184, ++ 68719476736 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.shmall", ++ "info": { ++ "desc": "1. 当系统运行需要大量共享内存的应用(如Oracle数据库)且出现\"SHMMAX too small\"错误时,需要增加该值至物理内存的80%左右\n\n2. 当系统频繁使用共享内存但未充分利用物理内存时,可适当降低该值以避免资源浪费,通常设置为(总物理内存 - 系统保留内存) / PAGE_SIZE", ++ "type": "continuous", ++ "range": [ ++ 1073741824, ++ 8589934592 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.core_uses_pid", ++ "info": { ++ "desc": "1. 当需要快速定位崩溃进程时,建议启用该参数(设为1),通过core文件名中的PID可以快速关联到具体进程信息\n\n2. 当系统频繁产生core文件且磁盘空间紧张时,建议禁用该参数(设为0),避免文件名过长导致管理困难", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.msgmni", ++ "info": { ++ "desc": "1. 当系统日志频繁出现\"msgmni limit reached\"错误时,表明当前队列数量不足以支撑应用需求,需要增加该值\n\n2. 对于频繁使用System V消息队列的中间件应用(如Oracle数据库),建议将该值设置为进程数量的4倍以上", ++ "type": "continuous", ++ "range": [ ++ 8000, ++ 128000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.msgmax", ++ "info": { ++ "desc": "1. 当应用频繁发送超过当前 kernel.msgmax 限制的大消息导致消息队列操作失败时,应适当增大该值,但需确保不超过系统可用内存的合理比例\n\n2. 
若系统存在大量小消息传输且 msgmax 设置过大导致内存碎片化,应降低该值以匹配实际消息大小,通常不低于 8KB", ++ "type": "continuous", ++ "range": [ ++ 4096, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.msgmnb", ++ "info": { ++ "desc": "增加该值当消息队列频繁达到默认上限(通常为16384字节)导致应用报错时 \n降低该值当系统存在大量闲置消息队列且需要减少内核内存占用时", ++ "type": "continuous", ++ "range": [ ++ 4096, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sem", ++ "info": { ++ "desc": "1. 当系统出现大量进程因信号量资源不足而阻塞时,需要增加semmni和semmsl值,建议将semmni设置为至少等于并发进程数,semmsl设置为每个进程可能需要的最大信号量数\n\n2. 当系统出现semop调用频繁失败或性能下降时,需要增大semopm值,建议将其设置为典型事务中所需的信号量操作数的2-3倍", ++ "range": [ ++ "16000 512000000 256 16000", ++ "32000 1024000000 500 32000", ++ "64000 2048000000 1000 64000" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.hung_task_timeout_secs", ++ "info": { ++ "desc": "1. 当系统频繁出现hung_task警告但实际任务仍在正常执行时,可适当增大该值(如从默认120秒调整为300秒),避免误报\n\n2. 对于存储密集型应用(如数据库服务器),若观察到存储设备响应较慢导致任务频繁超时,应结合存储延迟指标调高该值至存储设备平均响应时间的2-3倍", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 1200 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.nmi_watchdog", ++ "info": { ++ "desc": "1. 在生产服务器上建议禁用该参数(设置为0),因为NMI watchdog会周期性触发NMI中断,可能对系统性能产生轻微影响,尤其在高负载场景下\n\n2. 在调试内核死锁或硬件问题时可以临时启用(设置为1),帮助捕获长时间关中断导致的挂起问题", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.sched_rt_runtime_us", ++ "info": { ++ "desc": "1. 当系统需要运行更多实时任务时,可以适当增加该值(但不超过sched_rt_period_us的95%),默认值950000微秒可提高到990000微秒\n\n2. 当非实时任务出现严重饥饿现象时,应减小该值(建议不低于800000微秒),为普通任务保留更多CPU时间", ++ "type": "continuous", ++ "range": [ ++ 950000, ++ 1000000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.timer_migration", ++ "info": { ++ "desc": "1. 在NUMA架构服务器上运行低延迟应用时,若出现跨节点时钟中断导致的性能抖动,应禁用该参数以保持本地CPU处理时钟中断\n\n2. 当系统负载主要集中运行在单个NUMA节点且出现时钟中断处理不均衡时,可启用该参数允许时钟中断在CPU间迁移", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.threads-max", ++ "info": { ++ "desc": "1. 当系统频繁出现\"fork: Cannot allocate memory\"错误或应用程序因无法创建新线程而崩溃时,需要增加该值。可通过计算系统内存容量和单个线程平均内存占用来确定合理上限,通常设置为物理内存(MB)/8。\n\n2. 在高并发容器环境或运行大量轻量级线程的应用(如Java微服务)时,若/proc/sys/kernel/pid_max已调高但仍有线程创建限制,应将该值提升至至少pid_max值的2倍。", ++ "type": "continuous", ++ "range": [ ++ 655360, ++ 65536000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.sysrq", ++ "info": { ++ "desc": "1. 生产环境中建议设置为1(仅启用基本功能)或0(完全禁用),避免通过SysRq组合键意外触发系统操作,降低安全风险\n\n2. 调试崩溃或死机问题时临时设置为1或更大值(如176/128),启用更多调试功能后需立即恢复默认安全配置", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.sched_autogroup_enabled", ++ "info": { ++ "desc": "1. 在服务器环境下建议禁用该参数(设为0),因为自动任务分组主要针对桌面交互程序优化,服务器工作负载通常不需要这种调度特性\n\n2. 当服务器运行大量短时交互式任务且出现调度延迟问题时,可尝试启用(设为1)观察效果,但需注意可能影响批处理任务的吞吐量", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.numa_balancing", ++ "info": { ++ "desc": "1. 当系统运行NUMA架构且应用存在跨节点内存访问时,应启用该参数(设置为1)以减少远程内存访问延迟\n2. 对于内存密集型且对延迟敏感的应用,若观察到较高比例的跨节点内存访问,建议禁用该参数(设置为0)以避免自动平衡带来的性能波动", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "kernel.randomize_va_space", ++ "info": { ++ "desc": "1. 当系统运行安全性要求较高的服务时,建议保持默认值2(完全随机化),以增强对抗内存攻击的能力\n\n2. 若应用程序出现因地址随机化导致的兼容性问题,且运行环境可信,可临时调整为1(仅对数据段随机化)或0(禁用)进行测试", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "kernel.dmesg_restrict", ++ "info": { ++ "desc": "1. 
如果系统需要满足安全合规要求(如PCI-DSS、HIPAA等),建议设置为1以限制普通用户查看内核日志,防止敏感信息泄露\n2. 在需要开发调试或故障排查的环境中,建议设置为0以便非特权用户也能查看完整的系统日志信息", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "vm.swappiness", ++ "info": { ++ "desc": "1. 对于内存密集型应用(如数据库、缓存服务),建议将 vm.swappiness 设置为 10-30 以减少交换空间使用,优先利用物理内存\n2. 当系统频繁发生 OOM (Out of Memory) 时,可适当提高 vm.swappiness 至 60-80 以增加交换空间使用,避免进程被强制终止", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.vfs_cache_pressure", ++ "info": { ++ "desc": "1. 当系统频繁进行目录和inode缓存回收导致性能下降时,可适当降低该值(如设为50-100),减少内核回收缓存内存的频率\n\n2. 当系统内存充足但缓存利用率不足时,可适当提高该值(如设为150-200),促使内核更积极地回收缓存内存", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 500 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.dirty_background_ratio", ++ "info": { ++ "desc": "1. 对于写入密集型应用(如数据库服务器),建议将值从默认的10%提高到15-20%,以减少频繁的后台刷写对I/O性能的影响\n\n2. 对于内存较小的系统(如低于8GB),建议保持默认值或降至5-10%,以避免过多脏页堆积导致内存压力", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.dirty_ratio", ++ "info": { ++ "desc": "1. 当系统频繁因脏页刷盘导致I/O瓶颈时,可适当降低该值(如从默认20%降至10%),以减少单次刷盘的数据量,但会增加刷盘频率\n\n2. 若系统内存较大且主要处理顺序写入负载,可适当提高该值(如升至30%-40%),利用内存缓冲更多脏数据,减少磁盘I/O次数", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.stat_interval", ++ "info": { ++ "desc": "1. 当系统需要更频繁监控内存使用情况(如内存压力大或频繁交换时),可适当减小该值(如从默认10秒降至5秒),但需注意增加的系统开销 \n\n2. 在内存使用稳定且低负载环境中,可增大该值(如调至30秒)以减少/proc/vmstat的更新频率,降低内核开销", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 100 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.dirty_expire_centisecs", ++ "info": { ++ "desc": "1. 对于需要快速持久化数据的应用(如数据库),建议将值调低至100-300(1-3秒),以减少数据丢失风险 \n2. 对于写入密集型负载且对延迟敏感的应用,可适当提高至1000-3000(10-30秒),通过合并更多写操作来提升I/O吞吐量", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.dirty_writeback_centisecs", ++ "info": { ++ "desc": "1. 当系统频繁出现I/O等待或磁盘写入延迟较高时,可适当降低该值(如从默认500调整为200-300),以加快脏页回写频率,减少突发写入导致的性能波动\n\n2. 对于写入密集型负载且使用电池供电的设备(如服务器UPS环境),可适当提高该值(如设置为1000-1500),通过减少磁盘写入次数来降低I/O开销和能耗", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.overcommit_ratio", ++ "info": { ++ "desc": "1. 当物理服务器内存使用率长期低于50%且需要运行大量内存申请不确定的应用程序时,可适当提高该比例(如设置为80-90%)以提升内存利用率\n\n2. 在内存密集型应用场景下,若频繁触发OOM killer且监控显示实际内存使用接近物理内存总量,应降低该比例(如设置为50-70%)以避免过度承诺内存", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 100 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.overcommit_memory", ++ "info": { ++ "desc": "1. 当系统运行内存密集型应用且频繁触发OOM killer时,建议将值设为0(保守策略)或2(严格策略)以避免过度分配\n\n2. 当系统主要运行已知内存需求的批量任务且需要最大化内存利用率时,可设为1(总是允许过度分配)以提升吞吐量", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "vm.min_free_kbytes", ++ "info": { ++ "desc": "1. 当系统频繁触发直接内存回收(direct reclaim)导致性能下降时,需要增加该值以减少直接回收频率,建议设置为物理内存的1-3%\n\n2. 当系统存在大量不可移动页(unmovable pages)导致内存碎片化严重时,需适当提高该值以预留更多连续内存空间", ++ "type": "continuous", ++ "range": [ ++ 10240, ++ 1024000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.page-cluster", ++ "info": { ++ "desc": "1. 当系统频繁进行大块连续内存交换时,可适当增大该值(默认3,建议范围3-10),减少交换操作的I/O开销\n\n2. 
在SSD存储的交换分区环境中,由于随机访问性能较好,可降低该值(建议1-3)以减少单次交换延迟", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 8 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.max_map_count", ++ "info": { ++ "desc": "增加该值当运行内存密集型应用(如Elasticsearch或数据库)时出现\"max virtual memory areas vm.max_map_count [65530] is too low\"错误\n\n将该值设置为262144或更高当运行需要大量内存映射的Java应用(如Hadoop或Spark)时", ++ "type": "continuous", ++ "range": [ ++ 100000, ++ 10000000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.zone_reclaim_mode", ++ "info": { ++ "desc": "1. 当系统运行在NUMA架构且存在跨节点内存访问延迟问题时,建议将vm.zone_reclaim_mode设置为1,优先尝试在本地节点回收内存以减少远程访问延迟\n\n2. 当系统内存压力较大且本地节点回收效果不佳时,建议将vm.zone_reclaim_mode设置为0,允许从其他节点回收内存以提高整体回收效率", ++ "range": [ ++ "0", ++ "1", ++ "2", ++ "4" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "vm.watermark_scale_factor", ++ "info": { ++ "desc": "1. 当系统频繁触发直接内存回收(direct reclaim)且kswapd进程活跃度不足时,可适当降低该值(如从默认的10调整至5-8),使kswapd更早介入内存回收\n\n2. 在内存压力较大且kswapd持续运行的场景下,若观察到系统响应延迟增加,可尝试增大该值(如调整至15-20),延迟kswapd休眠时机以提升回收效率", ++ "type": "continuous", ++ "range": [ ++ 10, ++ 1000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "vm.numa_stat", ++ "info": { ++ "desc": "1. 当系统内存资源紧张且NUMA统计对当前业务场景不重要时,可将该参数设为0以降低统计精度,减少内存开销\n2. 在需要精确监控NUMA内存行为的高性能计算场景中,应保持该参数启用(默认值1)以获得完整统计信息", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "vm.drop_caches", ++ "info": { ++ "desc": "1. 当系统内存压力较大且缓存占用过高时,可临时设置为3释放pagecache、dentries和inodes缓存,但不宜频繁操作以免影响性能\n\n2. 在运行内存密集型应用前,可设置为1仅释放pagecache,避免缓存干扰应用性能测试结果", ++ "range": [ ++ "1", ++ "2", ++ "3" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "fs.inotify.max_user_watches", ++ "info": { ++ "desc": "1. 当监控大量文件或目录时(如日志目录、代码仓库等),若出现\"Too many open files\"或\"User limit of inotify watches reached\"错误,需增加该值\n\n2. 对于高并发文件监控场景(如实时日志分析、文件同步服务),建议将该值调整为默认值(通常8192)的4-8倍,具体数值应根据实际监控文件数量确定", ++ "type": "continuous", ++ "range": [ ++ 4096, ++ 819200 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "fs.nr_open", ++ "info": { ++ "desc": "1. 当应用(如数据库、Web服务器)频繁报告\"too many open files\"错误且ulimit -n已调高时,需增加该值至大于等于进程实际需要的最大文件描述符数\n\n2. 在内存资源紧张的系统中,若该值设置过高(如接近memlock限制),应适当降低以防止内存耗尽", ++ "type": "continuous", ++ "range": [ ++ 10240, ++ 1024000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "fs.file-max", ++ "info": { ++ "desc": "1. 当系统频繁出现\"Too many open files\"错误或监控显示文件句柄使用率持续接近当前限制时,需要增加该值\n\n2. 对于高并发服务(如Web服务器、数据库等),建议将该值设置为物理内存大小(KB)的10%-20%(例如64GB内存可设置为6,400,000-12,800,000)", ++ "type": "continuous", ++ "range": [ ++ 102400, ++ 10240000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "fs.aio-max-nr", ++ "info": { ++ "desc": "- 当系统日志频繁出现\"aio-max-nr reached\"警告或应用程序因异步I/O请求被拒绝而报错时,需要增加该值\n- 对于高并发数据库服务器(如MySQL/PostgreSQL)或大规模文件处理应用,建议将该值设置为(并发线程数×每个线程可能持有的未完成AIO请求数)×2", ++ "type": "continuous", ++ "range": [ ++ 102400, ++ 10240000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "fs.inotify.max_user_instances", ++ "info": { ++ "desc": "1. 当系统日志频繁出现\"inotify instance limit reached\"或类似错误时,表明当前用户运行的监控进程(如文件同步工具、开发热加载工具等)数量超过限制,需要增加该值\n\n2. 对于运行大量容器或微服务的环境,每个容器实例可能需要独立的inotify实例监控文件变化,此时应根据实际容器数量合理调高该参数", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 65535 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "fs.suid_dumpable", ++ "info": { ++ "desc": "1. 当系统需要调试setuid程序崩溃问题时,建议将值设为1(debug模式),允许生成核心转储文件用于故障分析\n\n2. 
在注重安全性的生产环境中,建议保持默认值0,避免潜在的安全风险,防止敏感信息通过核心转储泄露", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "blockdev", ++ "info": { ++ "desc": "增大预读值(如设置为8192)可提升顺序读性能,适用于频繁大文件顺序读场景\n\n降低预读值(如设置为128)可减少IO开销,适用于随机访问为主的场景", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2147483648 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.fifo_batch", ++ "info": { ++ "desc": "1. 当系统需要更高吞吐量且能容忍更高延迟时(如批量数据处理场景),可适当增大该值(如32-64) \n2. 当系统对延迟敏感(如实时数据库)且当前吞吐量足够时,可降低该值(如8-12)以减少单个批次的处理延迟", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 128 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.front_merges", ++ "info": { ++ "desc": "1. 在I/O负载主要来自顺序写入且存储设备性能良好时,建议保持默认值1以允许前向合并,这能减少请求数量提升吞吐量\n\n2. 当系统处理大量随机I/O或使用某些特定存储设备时出现性能下降,可尝试将该参数设为0禁用前向合并,减少不必要的合并操作开销", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.read_expire", ++ "info": { ++ "desc": "1. 当系统主要处理高优先级读操作(如数据库查询)且存在读延迟敏感型应用时,可适当降低该值(如从默认的125ms降至50-100ms),确保读请求能更快得到响应\n\n2. 若系统频繁出现读请求超时丢弃现象(可通过监控deadline调度器的统计信息发现),且存储设备实际响应能力优于当前设置,应适当调高该值(如增至150-200ms)以避免不必要的请求重试", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.writes_starved", ++ "info": { ++ "desc": "1. 当系统主要处理随机读取密集型负载(如数据库服务)且需要低延迟响应时,可适当提高该值(默认2-5范围),优先处理读请求以减少读延迟\n\n2. 当系统存在大量顺序写操作(如日志写入、数据备份)且写性能成为瓶颈时,应降低该值(最小可设为1),防止读请求过度抢占I/O带宽影响写入吞吐量", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 10 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.max_sectors_kb", ++ "info": { ++ "desc": "1. 当使用高性能存储设备(如NVMe SSD)且存在大量大块I/O操作时,可适当增大该值(如1024-4096 KB)以提高吞吐量\n2. 当出现I/O错误或设备驱动不稳定时,应降低该值至默认值512 KB或更小以增强稳定性", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 1024 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.queue_depth", ++ "info": { ++ "desc": "1. 当使用高性能存储设备(如NVMe SSD)且系统负载较高时,若观察到存储设备利用率不足或IOPS未达预期,可适当增加该值(通常建议从默认32逐步上调至64-256范围),但需确保不超过设备硬件队列深度限制\n\n2. 对于虚拟机环境或低性能旋转磁盘(如HDD),若延迟显著增加或出现请求超时,应将值降低至16-32范围以减少IO堆积", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 1024 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.nr_requests", ++ "info": { ++ "desc": "1. 当系统有高性能存储设备(如NVMe SSD)且IOPS吞吐量不足时,可适当增加该值(默认128),建议范围256-1024,以充分发挥设备并行处理能力\n\n2. 当系统出现高延迟或请求堆积时,若存储设备为机械硬盘,应降低该值(建议64-128),避免单个设备队列过深导致寻道时间增加", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 2048 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.read_ahead_kb", ++ "info": { ++ "desc": "1. 当系统主要运行顺序读取大文件的应用(如数据库、视频流服务)且内存充足时,可适当增大该值(如从默认128KB调整为512KB-1MB),以减少I/O等待时间\n\n2. 当系统内存压力较大或主要处理随机访问负载时,应降低该值(如调整为64KB或更低),避免预读过多无用数据占用宝贵的内存资源", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.rq_affinity", ++ "info": { ++ "desc": "1. 当系统在高I/O负载下出现CPU利用率不均衡时,建议调整该参数以提高本地CPU处理I/O请求的效率\n\n2. 在使用多队列存储设备时,若发现I/O性能未达到预期,建议调整此参数以充分利用多核CPU的并行处理能力", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.add_random", ++ "info": { ++ "desc": "1. 当系统对随机数质量要求极高且性能开销可接受时,建议启用该参数以增强熵池的随机性来源\n\n2. 在高性能计算或低延迟要求的场景下,若系统已有足够熵源,建议禁用该参数以避免I/O事件带来的额外开销", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.rotational", ++ "info": { ++ "desc": "1. 当存储设备为SSD时,必须将该参数设置为0,以避免系统错误地应用针对机械硬盘的I/O调度策略\n\n2. 
当存储设备为机械硬盘时,该参数应保持默认值1,以确保系统能正确应用适合旋转介质的相关优化", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.scheduler", ++ "info": { ++ "desc": "1. 对于MySQL数据库场景,建议将block.scheduler设置为mq-deadline,该调度算法能更好地处理数据库的随机I/O负载,减少I/O延迟\n\n2. 如果系统使用的是SSD存储设备,可以考虑设置为none调度器,因为SSD没有机械磁盘的寻道时间,简单的FIFO队列调度即可发挥最佳性能", ++ "range": [ ++ "mq-deadline", ++ "kyber", ++ "bfq", ++ "none" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.write_cache", ++ "info": { ++ "desc": "1. 当系统需要更高的写入性能且能容忍少量数据丢失风险时,建议设置为 write back 模式\n\n2. 当数据安全性要求极高且性能不是首要考虑时,建议设置为 write through 模式", ++ "range": [ ++ "write back", ++ "write through" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.nomerges", ++ "info": { ++ "desc": "1. 仅在调试I/O请求合并相关问题时设置为1或2,生产环境应保持默认值0以获得合并带来的性能优势 \n2. 当使用blktrace等工具进行底层块设备分析时,可临时禁用合并(设为2)以获取更精确的请求跟踪数据", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "blockdev_multidisk", ++ "info": { ++ "desc": "- 当系统频繁进行大规模顺序读操作且磁盘I/O吞吐量未达预期时,可适当增大该值以提升多磁盘并行预取能力\n- 若系统主要处理随机I/O且内存压力较大时,应减小该值以避免无效预取占用过多内存带宽", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 2147483648 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.fifo_batch_multidisk", ++ "info": { ++ "desc": "1. 在需要提高多磁盘环境下批量I/O吞吐量的场景中(如存储服务器或数据库服务器),可适当增大该值至32-64,但需监控请求延迟是否在可接受范围内\n\n2. 当系统出现明显的I/O延迟敏感型应用性能下降时(如实时事务处理系统),应将该值降低至8-12以减少批量请求带来的延迟影响", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 128 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.front_merges_multidisk", ++ "info": { ++ "desc": "1. 在SSD或多磁盘阵列环境下,若观察到频繁的前向合并导致I/O延迟增加,建议禁用该参数(设为0),因为SSD随机访问性能优异且多磁盘阵列本身具有并行处理能力,前向合并带来的优化效果有限\n\n2. 对于传统机械硬盘且主要处理顺序I/O负载的系统(如数据库日志写入),保持默认值(启用前向合并)即可,因为顺序访问模式能从合并操作中获益", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.read_expire_multidisk", ++ "info": { ++ "desc": "1. 当系统主要处理高优先级读请求且存在多磁盘环境时,若读请求延迟超过预期,可适当降低该值至100-200ms范围以减少读请求等待时间\n\n2. 当系统主要处理顺序大文件读取且磁盘负载较高时,若观察到频繁的请求超时,可适当提高该值至500-1000ms范围以避免不必要的请求重新排队", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.writes_starved_multidisk", ++ "info": { ++ "desc": "1. 当系统主要运行需要高读取性能的应用(如数据库服务)且磁盘I/O压力主要来自读取操作时,可以适当增大该值(例如从默认值2调整为3-5),以提高读取请求的优先级和吞吐量\n\n2. 当系统存在大量混合读写负载且写入延迟敏感型应用(如事务日志)时,应降低该值(例如调整为1),以避免写入请求因读取请求过多而长时间等待", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 10 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.max_sectors_kb_multidisk", ++ "info": { ++ "desc": "1. 当服务器使用高性能存储设备(如NVMe SSD或高端SAN)且存在大量顺序I/O负载时,建议增大该值至2048-4096 KB以提升吞吐量\n\n2. 当系统出现I/O请求合并不足或高延迟问题时,在确保存储设备支持较大传输单元的前提下,可逐步增加该值并监控性能变化", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 1024 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.queue_depth_multidisk", ++ "info": { ++ "desc": "- 当服务器使用多磁盘阵列且IOPS性能未达预期时,可适当增加该值以提高并行IO处理能力\n- 若系统出现高延迟或请求超时,且监控显示磁盘队列持续满载,应降低该值以避免请求堆积", ++ "type": "continuous", ++ "range": [ ++ 64, ++ 1024 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.nr_requests_multidisk", ++ "info": { ++ "desc": "1. 当系统有多个磁盘且I/O吞吐量不足时,可以适当增加该值以提高并行处理能力,但需确保不超过设备队列深度总和\n2. 在高延迟存储设备场景下,若观察到请求排队时间过长导致性能下降,应考虑降低该值以避免请求积压", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 2048 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.read_ahead_kb_multidisk", ++ "info": { ++ "desc": "1. 当服务器主要运行顺序读取密集型应用(如数据库、日志分析)且内存充足时,可适当增大该值(如512-2048KB),以减少I/O等待时间\n\n2. 
当系统内存压力较大或主要处理随机I/O负载时,应降低该值(如128-256KB)以避免缓存污染和内存浪费", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 65536 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "block.rq_affinity_multidisk", ++ "info": { ++ "desc": "1. 当服务器使用多磁盘阵列且CPU负载不均衡时,建议启用该参数以提高I/O并行性\n\n2. 在高并发低延迟要求的存储场景下,若出现CPU核心利用率不均导致I/O瓶颈,建议调整该参数值", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.add_random_multidisk", ++ "info": { ++ "desc": "1. 当系统依赖高强度加密操作(如SSL/TLS通信或加密文件系统)且/dev/random阻塞导致性能下降时,建议启用该参数以增加熵源多样性\n\n2. 在虚拟化环境中宿主机或容器的熵池补充不足时,应启用此参数以利用多磁盘I/O事件增强熵收集效率", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.rotational_multidisk", ++ "info": { ++ "desc": "1. 当存储设备为SSD时,必须将该参数设置为0以避免不必要的I/O调度开销\n2. 对于机械硬盘阵列环境,保持默认值1以优化旋转磁盘的I/O调度", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.scheduler_multidisk", ++ "info": { ++ "desc": "1. 对于MySQL数据库场景,建议将block.scheduler_multidisk设置为mq-deadline调度器,该调度器能有效减少I/O延迟,特别适合需要稳定响应时间的数据库工作负载\n\n2. 若系统使用SSD存储设备,可考虑设置为none调度器,因为SSD本身具有较低的访问延迟,不需要复杂的调度算法", ++ "range": [ ++ "mq-deadline", ++ "kyber", ++ "bfq", ++ "none" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.write_cache_multidisk", ++ "info": { ++ "desc": "1. 当系统需要更高写入性能且能容忍少量数据丢失风险时,建议启用write back缓存策略\n\n2. 当系统要求数据强一致性且不能容忍任何数据丢失时,建议使用write through缓存策略", ++ "range": [ ++ "write back", ++ "write through" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "block.nomerges_multidisk", ++ "info": { ++ "desc": "1. 当系统在多磁盘环境下出现I/O请求合并导致的性能下降或延迟问题时,建议保持默认值0以启用合并,除非有明确的性能分析表明禁用合并能带来提升\n\n2. 在进行存储子系统调试或性能基准测试需要隔离请求合并影响时,可临时设置为2禁用合并功能", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.core.netdev_budget", ++ "info": { ++ "desc": "1. 当网络接口频繁出现丢包(ifconfig显示RX dropped增加)且CPU软中断(softirq)占用过高时,建议适当增大该值(默认300可尝试调整为600-800)以提升单次软中断处理的包数量,减少中断次数\n\n2. 在低吞吐量但延迟敏感型场景(如高频交易系统)中,若网络延迟出现波动,可尝试降低该值(如调整为150-200)以减少单次软中断处理时间,降低处理延迟", ++ "type": "continuous", ++ "range": [ ++ 100, ++ 1000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.optmem_max", ++ "info": { ++ "desc": "- 当应用程序(如高性能网络服务)需要处理大量并发连接或大容量数据时,若出现 socket 缓冲区不足导致的性能瓶颈,可适当增加该值\n- 在内存资源充足的服务器上,若默认值(通常为 20480)无法满足特定应用(如视频流、大数据传输)的需求,可按 2 的幂次方逐步调高至合理范围(如 65536 或 131072)", ++ "type": "continuous", ++ "range": [ ++ 20480, ++ 204800 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.wmem_max", ++ "info": { ++ "desc": "1. 当服务器处理大量高吞吐量网络连接(如视频流、大文件传输等场景)时出现写缓冲区不足导致的性能瓶颈,建议将值从默认229376调整为16777216\n\n2. 在高并发TCP长连接场景(如WebSocket服务、消息队列等)中观察到因写缓冲区溢出导致的连接异常或数据丢失时,建议采用16777216值", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ 67108864 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.wmem_default", ++ "info": { ++ "desc": "1. 当应用主要处理大量小数据包传输时,若网络吞吐量低于预期且系统监控显示发送缓冲区频繁填满,可适当增大该值至32768-65535字节范围,减少频繁缓冲区填满导致的延迟 \n\n2. 在高带宽高延迟网络环境下(如跨数据中心传输),若TCP窗口缩放功能已启用但实际窗口仍受限于默认值,应将该值提升至至少163840字节(160KB)以匹配BDP(带宽延迟积)需求", ++ "type": "continuous", ++ "range": [ ++ 8192, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.rmem_default", ++ "info": { ++ "desc": "1. 当应用需要处理大量网络数据流(如视频流、大数据传输)且观察到频繁的TCP窗口缩放或重传时,建议将值从默认的212992字节提升至1-4MB范围(1048576-4194304字节),需配合net.core.rmem_max同步调整\n\n2. 
在高吞吐低延迟网络环境(如10Gbps以上)中,若netstat -s显示\"pruned\"或\"collapsed\"包统计持续增长,建议将值设置为BDP(带宽延迟积)的1/4至1/2,计算公式为:(带宽(bps) × 往返时延(s)) / 8 × 0.25", ++ "type": "continuous", ++ "range": [ ++ 8192, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.rmem_max", ++ "info": { ++ "desc": "1. 当应用需要处理高吞吐量网络数据流(如视频流、大数据传输)时,应将此值调整为16777216以提升接收性能 \n2. 在存在大量TCP长连接且频繁出现接收缓冲区不足警告(如内核日志报\"TCP: too much of memory\")时,应增大该值", ++ "type": "continuous", ++ "range": [ ++ 1048576, ++ 67108864 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.netdev_max_backlog", ++ "info": { ++ "desc": "1. 当服务器频繁出现网络丢包或高负载时,且通过监控发现 netdev_backlog 值持续接近或达到当前 netdev_max_backlog 设置值,应适当增大该参数值(例如从默认的1000调整为2000-3000) \n\n2. 对于10Gbps及以上高速网络接口,若默认值导致数据包处理延迟增加,需根据实际网络吞吐量和CPU处理能力按比例提升该参数值", ++ "type": "continuous", ++ "range": [ ++ 1000, ++ 100000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_thin_linear_timeouts", ++ "info": { ++ "desc": "1. 当服务器处理大量短生命周期TCP连接且频繁出现超时重传时,建议启用该参数(tcp_thin_linear_timeouts=1)以更精确检测瘦流并减少不必要的重传等待时间 \n\n2. 若服务器主要处理大文件传输或视频流等持续高吞吐连接,建议保持默认值(tcp_thin_linear_timeouts=0)以避免对正常数据流产生误判", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.unix.max_dgram_qlen", ++ "info": { ++ "desc": "1. 当系统频繁处理大量UDP数据报且出现丢包现象时,应考虑增加该值以提高队列容量\n2. 在高吞吐量UDP应用场景中,若观察到应用处理速度跟不上数据接收速度导致队列溢出,应适当调高此参数", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.somaxconn", ++ "info": { ++ "desc": "1. 当服务器需要处理大量并发连接请求(如高负载Web服务器)且出现连接被丢弃或排队延迟时,应将此值从默认128增大到1024或更高\n\n2. 在运行需要频繁建立短连接的服务(如反向代理、负载均衡器)时,建议将该值调整为至少等于或大于服务的worker_processes与worker_connections乘积的1/4", ++ "type": "continuous", ++ "range": [ ++ 128, ++ 65536 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.busy_poll", ++ "info": { ++ "desc": "1. 在高吞吐量网络环境中(如10Gbps以上),若CPU利用率不足且存在延迟敏感型应用,可适当增加该值(如50-100微秒)以减少中断频率,但需监控CPU负载避免过度占用\n\n2. 在低延迟网络环境(如高频交易系统)中,若网络延迟指标不达标且CPU资源充足,可尝试设置为0禁用该功能,强制使用中断模式降低延迟", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 200 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.busy_read", ++ "info": { ++ "desc": "1. 当网络设备处理高吞吐量小包时出现频繁读超时或性能下降,可尝试增加该值至100-200微秒范围,需结合具体硬件性能测试确定最优值\n\n2. 在低延迟网络环境中若观察到CPU使用率异常升高且与网络中断处理相关,可测试降低该值至20-30微秒以减少等待时间", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 200 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.core.dev_weight", ++ "info": { ++ "desc": "1. 当网络中断处理成为性能瓶颈时(通过监控发现CPU软中断时间占比过高),可适当增加该值以提高单次中断处理的数据包数量,但需注意避免单个CPU过载\n\n2. 对于高吞吐量网卡(如10G/25G以上)或大量小包场景,建议将该值从默认的64提高到128-256范围,需结合具体硬件和负载测试确定最优值", ++ "type": "continuous", ++ "range": [ ++ 16, ++ 1024 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_keepalive_intvl", ++ "info": { ++ "desc": "1. 当服务器需要检测长时间空闲连接的健康状态时,若默认值75秒导致故障检测延迟过高,可适当减小至30-60秒范围以加快故障发现,但需权衡网络负载增加的影响\n\n2. 在高延迟网络环境中,若频繁出现误判连接中断的情况,可考虑增大该值至90-120秒范围以减少不必要的探测流量,同时需配合调整tcp_keepalive_probes确保总体检测窗口合理", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 300 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_keepalive_probes", ++ "info": { ++ "desc": "1. 当服务器需要快速检测并释放失效连接(如负载均衡器后端健康检查场景)时,可适当减少该值(默认9),建议调整为3-5次以加快失效连接回收\n\n2. 在高延迟或不可靠网络环境中(如跨国VPN),为防止误判活跃连接为失效,应增大该值至12-15次,同时配合调整tcp_keepalive_time和tcp_keepalive_intvl参数", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 144 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_keepalive_time", ++ "info": { ++ "desc": "1. 当服务器需要检测长时间空闲连接的有效性时(如负载均衡器或反向代理场景),若默认值(7200秒)过长可能导致无效连接占用资源,可适当调低至300-600秒区间 \n\n2. 
在高并发短连接业务场景下,若出现大量TIME_WAIT状态连接导致端口耗尽,可配合减小tcp_keepalive_probes和tcp_keepalive_intvl参数,将本参数值提升至10800秒以上以减少keepalive探测频率", ++ "type": "continuous", ++ "range": [ ++ 600, ++ 36000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_tw_reuse", ++ "info": { ++ "desc": "1. 当服务器面临大量短连接请求且TIME-WAIT状态连接过多导致端口耗尽时,建议启用该参数(设置为1)以复用TIME-WAIT套接字\n2. 在NAT网络环境下或需要严格保证TCP连接可靠性的场景下,建议保持该参数为默认值0以避免潜在连接混乱风险", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_window_scaling", ++ "info": { ++ "desc": "1. 在高带宽或高延迟网络环境下(如长距离传输或高速网络),应确保该参数值为1以启用窗口缩放功能,提升大窗口TCP连接性能\n\n2. 当网络设备不支持RFC 1323或存在兼容性问题时,应将该参数设为0以禁用窗口缩放功能", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_fin_timeout", ++ "info": { ++ "desc": "1. 当服务器需要处理大量短连接请求时,如果观察到大量连接处于FIN_WAIT_2状态导致端口耗尽,建议将该值从默认的60秒降低到30秒或更低,以加快连接资源释放\n\n2. 对于高延迟网络环境或需要保持长时间连接的应用场景,如果发现连接异常终止问题,建议适当增加该值至120秒以上,确保连接正常关闭", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 120 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.udp_mem", ++ "info": { ++ "desc": "1. 当服务器频繁处理大量UDP流量(如DNS服务器、视频流服务器)且出现丢包或性能下降时,可适当增加high值(如默认值的2-3倍),确保有足够内存缓冲队列数据包\n\n2. 若系统空闲内存充足但UDP应用仍频繁触发压力模式(可通过监控/proc/net/sockstat观察),应按比例同步提高low和assure值(如low设为总内存的1%,assure设为2%)以避免不必要的内存回收抖动", ++ "range": [ ++ "12582912 16777216 25165824", ++ "25165824 33554432 50331648", ++ "50331648 100663296" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_mem", ++ "info": { ++ "desc": "1. 当系统在高并发TCP连接场景下出现内存不足或频繁触发OOM killer时,应适当增加三个值(最小压力值/压力阈值/最大值),建议按总物理内存的1%-3%计算,并确保最大值不超过系统可用内存的50%\n\n2. 若系统出现TCP性能下降或连接被拒绝(尤其在高吞吐量场景),需检查当前值是否过小,建议将最小值设为当前活跃连接内存占用的1.5倍,最大值设为系统空闲内存的30%-40%", ++ "range": [ ++ "6168306 8224411 12336612", ++ "12336612 16448822 24673224" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_rmem", ++ "info": { ++ "desc": "1. 在高吞吐量网络环境中(如视频流服务器、大数据传输节点),当默认最大值6291456(6MB)导致TCP接收窗口成为瓶颈时,建议将第三个值调整为16777216(16MB)以提升吞吐量\n\n2. 对于内存资源受限的服务器(如云主机或容器环境),若默认值87380(85KB)的初始缓冲区导致内存压力,可将中间值降至65536(64KB)以平衡性能与资源消耗", ++ "range": [ ++ "4096 16384 4194304", ++ "4096 32768 8388608", ++ "4096 65536 16777216" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_wmem", ++ "info": { ++ "desc": "1. 在高吞吐量网络环境中(如视频流服务器、文件传输服务器),建议将参数调整为 4096 65536 16777216,以提升大流量场景下的TCP写缓冲区性能\n\n2. 对于内存资源受限的服务器(如云主机或容器环境),若出现内存压力时应适当降低最大值(如调整为 4096 32768 8388608),避免TCP写缓冲区占用过多内存", ++ "range": [ ++ "4096 16384 4194304", ++ "4096 32768 8388608", ++ "4096 65536 16777216" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_fastopen", ++ "info": { ++ "desc": "1. 当服务器主要处理大量短连接请求(如HTTP API服务)且需要降低TCP握手延迟时,建议启用该参数(值为3同时支持客户端和服务器端)\n\n2. 当服务器处于严格安全环境或处理敏感数据时,建议禁用该参数(值为0)以避免潜在的安全风险", ++ "range": [ ++ "1", ++ "2", ++ "4" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_synack_retries", ++ "info": { ++ "desc": "1. 当服务器处于高并发连接场景且出现大量SYN_RECV状态连接时,若网络延迟较高,可适当增加该值(默认5)至7-10次,确保在拥塞环境下完成三次握手\n\n2. 若服务器遭受SYN Flood攻击或处于高负载状态,可降低该值至2-3次以快速释放半连接资源,减少SYN队列占用时间", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 64 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_syn_retries", ++ "info": { ++ "desc": "1. 在延迟较高或不稳定的网络环境中(如跨国网络或移动网络),建议将默认值6适当增加到8-10,以应对可能出现的SYN丢包情况,但需注意这会延长连接建立失败时的等待时间\n\n2. 
对于内网或低延迟高可靠网络环境,建议降低到3-4以减少连接建立超时等待时间,提高应用响应速度", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 64 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_moderate_rcvbuf", ++ "info": { ++ "desc": "1. 当应用需要处理大量突发流量时,建议启用该参数(设置为1),系统会自动调整接收缓冲区大小以适应流量变化\n2. 在内存资源受限的环境中,建议禁用该参数(设置为0),避免系统自动扩大接收缓冲区导致内存压力增加", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_timestamps", ++ "info": { ++ "desc": "1. 在存在NAT设备或负载均衡器的网络环境中,建议禁用该参数(设置为0),以避免可能的时间戳冲突导致的连接问题\n2. 在高速低延迟的内网环境中,建议启用该参数(设置为1),以获得更精确的RTT计算和更好的TCP性能", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_dsack", ++ "info": { ++ "desc": "1. 在延迟敏感型应用环境中(如高频交易系统),建议设置为0(禁用)以减少不必要的ACK确认包带来的网络开销\n\n2. 在常规Web服务或文件传输场景下保持默认值1(启用),可帮助处理网络丢包情况下的数据重传效率", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_fack", ++ "info": { ++ "desc": "1. 在存在高延迟或高丢包率的网络环境中,建议启用该参数以改善TCP重传性能,通过选择性确认减少不必要的重传\n\n2. 当服务器作为高性能网络应用(如视频流、大文件传输)的接收端时,建议保持启用状态以优化TCP拥塞控制机制", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_sack", ++ "info": { ++ "desc": "1. 在广域网(WAN)通信环境下应保持启用(1),可显著改善高延迟或丢包网络中的TCP传输性能,即使会略微增加CPU负载\n2. 在低延迟、高带宽的局域网(LAN)环境中可考虑禁用(0),特别是当系统CPU资源已高度饱和且网络质量极佳时", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_low_latency", ++ "info": { ++ "desc": "在高吞吐量且对延迟敏感的集群环境(如Beowulf集群)中应启用(设置为1) \n在普通网络环境下保持禁用(设置为0)以避免不必要的开销", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_adv_win_scale", ++ "info": { ++ "desc": "1. 当应用需要处理大量小包网络流量时,若观察到TCP接收窗口利用率不足,可考虑增大该值(如设为2或3),以减少缓冲区开销比例,提升小包传输效率\n\n2. 在内存资源紧张的服务器环境中,若发现TCP内存消耗过高导致系统频繁OOM,可适当降低该值(如设为1或0),增加缓冲区开销比例以降低内存使用量", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 4 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.route.max_size", ++ "info": { ++ "desc": "1. 当服务器频繁处理大量网络连接或作为路由器转发大量数据包时,若观察到路由缓存频繁刷新导致性能下降,可适当增加该值(默认值4096可逐步倍增测试)\n\n2. 在高内存压力环境下,若路由表占用内存过高影响其他服务,且实际活跃路由条目远低于当前设置值,可适当降低该值以释放内存", ++ "type": "continuous", ++ "range": [ ++ 67108864, ++ 2080374784 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_max_tw_buckets", ++ "info": { ++ "desc": "1. 当服务器出现大量TIME_WAIT状态的TCP连接导致端口耗尽或性能下降时,建议将net.ipv4.tcp_max_tw_buckets从默认值2048调整为360000。\n\n2. 在高并发短连接场景下,若监控发现TIME_WAIT连接数频繁达到上限,可适当增大该值至360000以提升连接处理能力。", ++ "type": "continuous", ++ "range": [ ++ 32768, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_max_syn_backlog", ++ "info": { ++ "desc": "1. 当服务器频繁处理大量新连接请求且出现 SYN 包丢弃时,应考虑增大该值至 8192 或更高\n\n2. 在高并发短连接场景下,若监控发现 SYN_RECV 状态连接数常接近默认值 2048,应调整该参数以避免连接建立延迟", ++ "type": "continuous", ++ "range": [ ++ 1024, ++ 262144 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_max_orphans", ++ "info": { ++ "desc": "1. 当服务器频繁出现\"Out of socket memory\"错误或日志中出现大量orphaned sockets警告时,需要增加该值。建议根据当前系统内存容量调整,通常设置为内存容量的1/4对应的socket数量(每个orphan约占用64KB内存)\n\n2. 对于高并发短连接服务(如HTTP服务器、负载均衡器),若观察到tcp_max_orphans限制成为性能瓶颈(通过监控/proc/net/sockstat中orphan数量接近上限),应适当调高该值至并发连接数的1.2-1.5倍", ++ "type": "continuous", ++ "range": [ ++ 65536, ++ 16777216 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_ecn", ++ "info": { ++ "desc": "1. 当网络中存在不支持ECN的老旧网络设备时,建议保持默认值0(禁用),以避免潜在的数据包丢弃问题\n\n2. 
在确认网络设备完全支持ECN且需要降低TCP重传率的环境中,建议设置为1(启用)以获得更好的拥塞控制性能", ++ "range": [ ++ "0", ++ "1", ++ "2" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.ip_forward", ++ "info": { ++ "desc": "- 当服务器需要作为路由器或VPN网关时,应设置为1以启用IPv4转发功能\n- 当服务器仅作为终端主机使用时,应保持默认值0以禁用转发功能,减少潜在安全风险", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.default.rp_filter", ++ "info": { ++ "desc": "1. 当服务器作为路由器或需要处理多路径网络流量时,建议将rp_filter设置为2(宽松模式),以避免严格的反向路径验证导致合法流量被丢弃\n2. 在单网卡服务器且网络环境可信的情况下,可以设置为0(关闭验证)以减少内核处理开销,但需确保网络环境无IP欺骗风险", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.ip_local_port_range", ++ "info": { ++ "desc": "1. 当服务器需要处理大量并发连接(如高负载代理服务器或Web服务器)时,默认的32768-60999端口范围可能导致端口耗尽,此时应扩大范围(如1024-65535),但需保留1024以下端口给特权服务\n\n2. 在容器化环境中运行多个实例时,为避免端口冲突,需要为每个实例分配不重叠的本地端口范围,同时确保总范围不超过系统上限", ++ "range": [ ++ "32768 60999", ++ "1024 65535", ++ "8192 65535" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_no_metrics_save", ++ "info": { ++ "desc": "1. 在高并发短连接场景下,建议设置为1以禁用TCP连接参数保存,避免因大量无效参数缓存导致内存浪费和性能下降\n2. 在需要保持长连接稳定性的场景下,建议保持默认值0,允许重用之前连接的有效参数来优化新连接建立性能", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.ip_default_ttl", ++ "info": { ++ "desc": "1. 当网络中存在多层NAT或复杂路由环境时,若出现数据包提前被丢弃的情况,可考虑将TTL值提高到128,确保数据包能到达更远的网络节点\n\n2. 对于需要限制数据包传播范围的场景(如内部测试网络),可降低TTL值至32以下,防止数据包在网络中过度传播", ++ "type": "continuous", ++ "range": [ ++ 8, ++ 128 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.ip_no_pmtu_disc", ++ "info": { ++ "desc": "1. 当网络中存在路径MTU发现(PMTUD)问题导致连接超时或性能下降时,建议将该参数设为1以禁用PMTUD,避免因ICMP黑洞或防火墙丢弃数据包导致的连接问题\n\n2. 在高速网络环境(如10Gbps以上)且网络设备可靠支持PMTUD时,建议保持默认值0以启用PMTUD,确保TCP能动态发现最优MTU值提升吞吐量", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_retries2", ++ "info": { ++ "desc": "1. 对于高延迟或不稳定网络环境中的服务器,可考虑将值降低到5-8,减少因网络临时故障导致的连接长时间挂起问题\n2. 对于需要快速检测连接失效的金融交易类服务器,建议设置为3-5,确保能更快释放失效连接资源", ++ "type": "continuous", ++ "range": [ ++ 3, ++ 30 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_orphan_retries", ++ "info": { ++ "desc": "1. 当服务器面临大量半连接(orphaned sockets)导致资源占用过高时,可适当降低该值(如3-5),加速释放资源,但需注意过低可能导致正常长延迟网络下的连接被过早丢弃\n\n2. 若服务器主要处理本地或低延迟网络通信,且出现过多重试浪费资源的情况,可降至2-3次以减少不必要的等待时间", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 15 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_syncookies", ++ "info": { ++ "desc": "1. 当服务器频繁遭受SYN flood攻击时,应启用该参数(设置为1),以保护系统资源不被耗尽\n2. 在正常网络环境下且未遭受攻击时,建议保持默认值(通常为1),因为启用syncookies可能导致TCP连接性能略微下降", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_reordering", ++ "info": { ++ "desc": "1. 当网络路径存在较高丢包率或频繁重排序时,若观察到TCP重传率明显上升且吞吐量下降,应考虑适当增大该值(默认3可尝试调整为9-12),以容忍更多乱序数据包而非错误触发快速重传\n\n2. 在低延迟网络环境(如数据中心内部)且使用TSO/GRO等卸载技术时,若内核日志频繁出现\"TCP: too many of order packets\"警告,可将该值适度降低(如调整为6-8),减少乱序队列内存占用", ++ "type": "continuous", ++ "range": [ ++ 2, ++ 10 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_retrans_collapse", ++ "info": { ++ "desc": "1. 在Linux服务器环境中,若确认无老旧打印机设备需要兼容,建议禁用此参数以优化TCP重传性能\n\n2. 当网络吞吐量出现异常下降且排查其他因素无果时,可尝试禁用此参数观察是否由打印机兼容性功能引起", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_congestion_control", ++ "info": { ++ "desc": "1. 
在高带宽、高延迟网络环境下(如跨数据中心通信),建议将默认的\"cubic\"算法切换为\"bbr\",可更充分利用带宽并减少排队延迟\n\n2. 在无线网络或移动网络环境中,若出现频繁丢包,建议使用\"vegas\"或\"westwood\"算法,这些算法对丢包区分更准确,能避免误判拥塞", ++ "range": [ ++ "cubic", ++ "reno", ++ "bbr" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.default.promote_secondaries", ++ "info": { ++ "desc": "1. 当服务器需要保持高可用性且依赖多个IP地址时,建议设置为1,确保主IP被移除时次IP能自动提升为主IP,避免服务中断\n\n2. 在安全性要求严格的环境中建议设置为0,确保主IP被移除时所有关联IP都被清除,防止潜在的安全风险", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.all.promote_secondaries", ++ "info": { ++ "desc": "1. 当服务器需要保持高可用性且依赖多个IP地址时,建议设置为1,以确保主IP被移除时次IP能自动提升为主IP,避免服务中断\n\n2. 当服务器IP地址管理需要严格遵循变更控制流程时,建议保持默认值0,以确保任何IP地址变更都需要明确操作,防止意外配置变更", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.all.accept_redirects", ++ "info": { ++ "desc": "1. 对于作为网关或路由器的Linux服务器,建议设置为0以禁用ICMP重定向消息,防止潜在的网络拓扑欺骗攻击\n2. 对于普通主机服务器,若网络环境可信且需要ICMP重定向功能优化路由,可保持默认值1;否则建议设置为0增强安全性", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.default.accept_redirects", ++ "info": { ++ "desc": "1. 在作为路由器使用时,建议设置为0以禁用ICMP重定向消息,防止潜在的网络拓扑混淆和安全风险\n2. 在作为终端主机使用时,可保持默认值1以接受重定向消息,但若网络环境安全要求较高,建议同样设置为0", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.all.secure_redirects", ++ "info": { ++ "desc": "- 在安全要求较高的生产环境中,建议设置为0禁用,避免潜在的安全风险\n- 若网络环境需要接收特定ICMP重定向且信任网关,可设置为1但需配合其他安全措施", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.default.secure_redirects", ++ "info": { ++ "desc": "1. 在安全要求较高的生产环境中建议设置为0,防止潜在的网络重定向攻击\n2. 如果服务器需要接收来自可信网关的ICMP重定向消息以优化路由,可设置为1", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.ipv4.icmp_echo_ignore_broadcasts", ++ "info": { ++ "desc": "1. 如果服务器处于可能遭受ICMP广播风暴攻击的网络环境(如公开网络或DMZ区域),建议设置为1以避免资源耗尽\n\n2. 如果服务器位于受保护的内网且需要接收ICMP广播(如网络设备发现等场景),建议设置为0以保持功能正常", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.nf_conntrack_max", ++ "info": { ++ "desc": "1. 当服务器处理大量网络连接(如高并发代理、NAT网关或防火墙)且频繁出现\"nf_conntrack: table full\"日志时,需要增加该值以避免连接跟踪表溢出\n\n2. 当系统内存不足且连接跟踪表利用率持续低于50%时,可适当降低该值以释放内存资源", ++ "type": "continuous", ++ "range": [ ++ 65536, ++ 1048576 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.netfilter.nf_conntrack_tcp_timeout_established", ++ "info": { ++ "desc": "1. 当服务器处理大量持久TCP连接(如长连接服务、代理服务器等)且观察到nf_conntrack表频繁满导致丢包时,可适当增大该值(默认43200秒/12小时),但需确保不超过客户端实际连接保持时间,避免无效连接占用资源\n\n2. 对于短连接为主的Web服务器环境,若系统内存压力较大且连接跟踪表占用过高,可适当降低该值(但不应小于1800秒),以加速连接表项回收", ++ "type": "continuous", ++ "range": [ ++ 108000, ++ 1728000 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.netfilter.nf_conntrack_tcp_timeout_close_wait", ++ "info": { ++ "desc": "1. 当服务器处理大量短连接且频繁出现close_wait状态时,若该值过大(默认240秒)会导致连接资源长时间占用,可适当降低至60-120秒范围\n\n2. 对于长连接为主的服务器环境,若发现连接异常断开导致资源泄漏,可考虑增大该值至300-600秒范围", ++ "type": "continuous", ++ "range": [ ++ 15, ++ 240 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.netfilter.nf_conntrack_tcp_timeout_fin_wait", ++ "info": { ++ "desc": "1. 当服务器处理大量短连接且频繁出现FIN_WAIT状态连接堆积时,若系统日志显示nf_conntrack表频繁满导致丢包,可适当减少该值至30-60秒范围以加速连接回收\n\n2. 
若服务器主要处理长连接且并发连接数远低于nf_conntrack_max的80%,出现FIN_WAIT状态连接过早超时导致异常断开时,可增大该值至120-300秒范围确保正常连接终止流程完成", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 480 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.netfilter.nf_conntrack_tcp_timeout_time_wait", ++ "info": { ++ "desc": "1. 当服务器处理大量短连接且出现大量TIME_WAIT状态连接导致nf_conntrack表满时,可适当降低该值(默认120秒),建议调整为30-60秒以更快释放连接跟踪条目\n\n2. 若服务器作为反向代理或负载均衡器且出现端口耗尽问题,在确认无重传包风险后可考虑将该值降至15-30秒,但需确保大于TCP的2MSL时间", ++ "type": "continuous", ++ "range": [ ++ 30, ++ 480 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.conf.default.forwarding", ++ "info": { ++ "desc": "1. 当服务器需要作为路由器或网关转发IPv4流量时,应将该参数设置为1,否则保持默认值0以关闭转发功能提升安全性\n2. 在容器或虚拟化环境中,若宿主机需要为虚拟机/容器提供网络转发功能,则需启用该参数", ++ "range": [ ++ "0", ++ "1" ++ ], ++ "type": "discrete", ++ "dtype": "string" ++ } ++ }, ++ { ++ "name": "net.core.rps_sock_flow_entries", ++ "info": { ++ "desc": "1. 当服务器处理大量网络连接且RPS/RFS功能开启时,若出现CPU缓存命中率下降或网络延迟增加,应考虑增加该值(通常建议设置为32768或65536)\n\n2. 在高吞吐量网络环境下(如10Gbps以上),若网络性能未达预期且/proc/net/softnet_stat显示drop计数增长,应将该值调整为至少等于或大于预期并发连接数", ++ "type": "continuous", ++ "range": [ ++ 0, ++ 131072 ++ ], ++ "dtype": "int" ++ } ++ }, ++ { ++ "name": "net.ipv4.tcp_min_tso_segs", ++ "info": { ++ "desc": "1. 当服务器主要处理大量小数据包(如小于1460字节)且TSO利用率低时,可适当降低该值(默认2)以减少延迟,但需确保不低于1以避免性能下降\n\n2. 对于高速网络(10Gbps+)且处理大数据传输的场景,若观察到TSO分段不足导致CPU利用率过高,可适当增大该值(建议不超过8)以提升吞吐量", ++ "type": "continuous", ++ "range": [ ++ 1, ++ 16 ++ ], ++ "dtype": "int" ++ } ++ } ++] +\ No newline at end of file +diff --git a/copilot-tune/src/performance_analyzer/__init__.py b/copilot-tune/src/performance_analyzer/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/src/performance_analyzer/app_analyzer.py b/copilot-tune/src/performance_analyzer/app_analyzer.py +new file mode 100644 +index 0000000..927621e +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/app_analyzer.py +@@ -0,0 +1,77 @@ ++from src.performance_analyzer.base_analyzer import BaseAnalyzer ++ ++ ++class AppAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ ++ def analyze(self) -> str: ++ if not self.data: ++ return f"当前系统没有运行{self.app}应用,无需分析{self.app}性能。\n" ++ report = f"基于采集的系统指标, {self.app}初步的性能分析如下:\n" ++ for cmd, result in self.data.items(): ++ profile_prompt = f""" ++ # CONTEXT # ++ 以下内容是linux命令<{cmd}>的输出: ++ {result} ++ ++ # OBJECTIVE # ++ 请根据上述信息,简要分析{self.app}应用的性能状况。 ++ 要求: ++ 1.答案不超过200字。 ++ 2.答案中不要包含任何优化建议。 ++ 3.答案中尽可能保留信息中真实有效的数据。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ report += self.ask_llm(profile_prompt) ++ ++ return report ++ ++ def generate_report( ++ self, ++ app_report: str ++ ) -> str: ++ # TO DO ++ # 要有一个报告模板,指明包含哪些信息,以及报告格式 ++ if app_report == f"当前系统没有运行{self.app}应用,无需分析{self.app}性能。\n": ++ return app_report ++ report_prompt = f""" ++ # CONTEXT # ++ linux系统中正在运行{self.app}应用, 以下内容是{self.app}相关的性能信息: ++ {app_report} ++ 信息中所涉及到的数据准确无误,真实可信。 ++ ++ # OBJECTIVE # ++ 请根据上述信息,分析{self.app}应用的性能状况。 ++ 要求: ++ 1.答案中不要包含任何优化建议。 ++ 2.答案中尽可能保留信息中真实有效的数据。 ++ 3.不要遗漏任何值得分析的信息。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 回答以"{self.app}分析如下:"开头,然后另起一行逐条分析。 ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(report_prompt) + "\n"
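++ ++ # 使用示例(仅为示意;假设 data 来自 AppCollector 的采集结果,形如 {命令: 输出文本},命令名为假设值): ++ # analyzer = AppAnalyzer(app="mysql", data={"mysqladmin status": "..."}) ++ # print(analyzer.run()) +diff --git 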
a/copilot-tune/src/performance_analyzer/base_analyzer.py b/copilot-tune/src/performance_analyzer/base_analyzer.py +new file mode 100644 +index 0000000..4865039 +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/base_analyzer.py +@@ -0,0 +1,38 @@ ++from abc import ABC, abstractmethod ++from typing import Dict, Any ++ ++from src.utils.llm import get_llm_response ++ ++ ++class BaseAnalyzer(ABC): ++ def __init__(self, app: str, data: Dict[str, Any]): ++ self.data = data ++ self.app = app ++ ++ @abstractmethod ++ def analyze(self, **kwargs) -> str: ++ pass ++ ++ @abstractmethod ++ def generate_report(self, **kwargs) -> str: ++ pass ++ ++ def ask_llm( ++ self, ++ prompt: str ++ ) -> str: ++ return get_llm_response(prompt) ++ ++ def generate_report_line( ++ self, ++ condition: Any, ++ message: str, ++ ) -> str: ++ if condition: ++ return message + "\n" ++ return "" ++ ++ def run(self) -> str: ++ analyze_result = self.analyze() ++ report = self.generate_report(analyze_result) ++ return report +diff --git a/copilot-tune/src/performance_analyzer/cpu_analyzer.py b/copilot-tune/src/performance_analyzer/cpu_analyzer.py +new file mode 100644 +index 0000000..472d0cd +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/cpu_analyzer.py +@@ -0,0 +1,176 @@ ++from .base_analyzer import BaseAnalyzer ++ ++class CpuAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ ++ def analyze(self) -> str: ++ report = "基于采集的系统指标, CPU初步的性能分析如下:\n" ++ avg_load_report = self.avg_load_analysis() ++ cpu_info_report = self.cpu_info_analysis() ++ pid_info_report = self.pid_info_analysis() ++ ++ report += avg_load_report ++ report += cpu_info_report ++ report += pid_info_report ++ ++ return report ++ ++ def avg_load_analysis(self) -> str: ++ avg_load_analysis_report = "" ++ ++ # 提取cpu平均负载数据 ++ one_min, five_min, ten_min = self.data.get("1min", 0.0), self.data.get("5min", 0.0), self.data.get("10min", 0.0) ++ ++ # 生成平均负载数据 ++ avg_load_analysis_report += f"当前系统1分钟平均负载是{one_min}, 5分钟平均负载是{five_min}, 10分钟平均负载是{ten_min}\n" ++ ++ # 生成报告 ++ avg_load_analysis_report += ( ++ self.generate_report_line(one_min > 1, "过去1分钟系统负载过重,系统可能存在cpu性能瓶颈") ++ ) ++ avg_load_analysis_report += ( ++ self.generate_report_line(five_min > 1, "过去5分钟系统负载过重,系统可能存在cpu性能瓶颈") ++ ) ++ avg_load_analysis_report += ( ++ self.generate_report_line(ten_min > 1, "过去10分钟系统负载过重,系统可能存在cpu性能瓶颈") ++ ) ++ ++ # 检查负载是否突然增加 ++ sudden_increase_message = ( ++ "过去1分钟系统负载突然迅速增加,系统对cpu性能要求可能会变高" ++ if (one_min > 2 * five_min or one_min > 2 * ten_min) and one_min > 1 ++ else "" ++ ) ++ avg_load_analysis_report += self.generate_report_line(sudden_increase_message, sudden_increase_message) ++ ++ # 检查负载稳定性 ++ stability_message = ( ++ "过去10分钟内系统负载较稳定,无明显波动" ++ if abs(one_min - five_min) <= 0.2 and abs(one_min - ten_min) <= 0.2 and abs(five_min - ten_min) <= 0.2 ++ else "过去10分钟内系统负载存在一定变化波动" ++ ) ++ avg_load_analysis_report += self.generate_report_line(stability_message, stability_message) ++ ++ # 检查负载上升趋势 ++ trend_message = ( ++ "过去10分钟内系统负载呈不断上升趋势,系统对cpu性能要求可能会变高" ++ if one_min - five_min > 0.2 and five_min - ten_min > 0.2 and one_min > 0.5 ++ else "" ++ ) ++ avg_load_analysis_report += self.generate_report_line(trend_message, trend_message) ++ ++ return avg_load_analysis_report ++ ++ def cpu_info_analysis(self) -> str: ++ cpu_info_analysis_report = "" ++ ++ # 提取CPU信息数据 ++ usr, sys, irq, soft, util = ( ++ self.data.get("用户态中的cpu利用率", 0), ++ self.data.get("kernel内核态执行时的CPU利用率", 0), ++ self.data.get("硬中断占用CPU时间的百分比", 0), ++ 
self.data.get("软中断占用CPU时间的百分比", 0), ++ self.data.get("CPU利用率", 0) ++ ) ++ block_process, cpu_load, io_load = ( ++ self.data.get("阻塞进程率", 0), ++ self.data.get("计算密集型", 0), ++ self.data.get("IO密集型", 0) ++ ) ++ context_switch, sys_call, cpu_num = ( ++ self.data.get("系统每秒进行上下文切换的次数", 0), ++ self.data.get("系统单位时间调用次数", 0), ++ self.data.get("cpu核数", 1) # 默认为1,避免除以0 ++ ) ++ ++ # 构建基本信息报告 ++ cpu_info_analysis_report += ( ++ f"当前系统中, 用户态CPU利用率: {usr}%, 内核态CPU利用率: {sys}%, " ++ f"硬中断占比: {irq}%, 软中断占比: {soft}%, CPU总体利用率: {util}%\n" ++ ) ++ ++ # 根据条件生成其他报告行 ++ conditions_and_messages = [ ++ (usr + sys + irq + soft > 0.9, "当前系统负载较高, 可能存在CPU瓶颈。"), ++ (cpu_load == 1, "系统用户态CPU利用率远大于内核态CPU利用率, 表明系统上的应用程序正在大量使用CPU资源, 是计算密集型负载场景。"), ++ (io_load == 1, "系统内核代码调用频率很高, 符合I/O密集型负载场景的特征。"), ++ (context_switch > cpu_num * 4000, f"系统每秒发生的上下文切换次数是{context_switch},已超出正常阈值上限,会对系统性能产生劣化影响。"), ++ (sys_call > cpu_num * 10000, f"每秒系统调用次数是{sys_call},表明有大量的系统调用正在发生,可能是由高负载或资源密集型应用程序引起的。"), ++ ((usr + sys) > 0.7 and sys > (0.75 * usr + 0.75 * sys), "在系统模式下的处理能力可能不足,系统可能无法有效地处理所有传入的系统调用,可能导致响应时间变长或系统性能下降。"), ++ (sys_call < 100 and util > 0.5, "系统当前有大量浮点异常(FPEs)进程。"), ++ ] ++ ++ for condition, message in conditions_and_messages: ++ cpu_info_analysis_report += self.generate_report_line(condition, message) ++ ++ # 添加阻塞进程报告 ++ cpu_info_analysis_report += f"处于阻塞状态的进程占比是{block_process}%\n" ++ ++ return cpu_info_analysis_report ++ ++ def pid_info_analysis(self) -> str: ++ pid_info_report = "基于采集的系统指标,系统进程初步的性能分析如下:\n" ++ pid_prompt = """ ++ # CONTEXT # ++ 当前有linux系统进程的数据,性能指标是在linux系统中执行 pidstat -d | head -6 获得的输出,内容如下: ++ {pid_info} ++ ++ # OBJECTIVE # ++ 请根据这些性能指标,生成一份逻辑清晰、条理清楚的系统进程的性能总结报告。 ++ 要求: ++ 1.答案中只分析可能对系统性能产生影响的指标数据。 ++ 2.答案中不要包含任何优化建议。 ++ 3.答案中尽可能保留信息中真实有效的数据。 ++ 4.答案不超过200字。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ pid_info = self.data["进程信息"] ++ pid_info_report += self.ask_llm(pid_prompt.format(pid_info=pid_info)) ++ return pid_info_report ++ ++ def generate_report( ++ self, ++ cpu_report: str ++ ) -> str: ++ # TO DO ++ # 要有一个报告模板,指明包含哪些信息,以及报告格式 ++ report_prompt = f""" ++ 以下内容是linux系统中cpu相关的性能信息: ++ {cpu_report} ++ 信息中所涉及到的数据准确无误,真实可信。 ++ ++ # OBJECTIVE # ++ 请根据上述信息,分析系统cpu的性能状况。 ++ 要求: ++ 1.答案中不要包含任何优化建议。 ++ 2.答案中尽可能保留信息中真实有效的数据。 ++ 3.不要遗漏任何值得分析的信息。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 回答以"CPU分析如下:"开头,然后另起一行逐条分析。 ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(report_prompt) + "\n" +\ No newline at end of file +diff --git a/copilot-tune/src/performance_analyzer/disk_analyzer.py b/copilot-tune/src/performance_analyzer/disk_analyzer.py +new file mode 100644 +index 0000000..2fa06f5 +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/disk_analyzer.py +@@ -0,0 +1,109 @@ ++from .base_analyzer import BaseAnalyzer ++ ++class DiskAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ ++ def analyze(self) -> str: ++ report = "基于采集的系统指标, 磁盘初步的性能分析报告如下: \n" ++ disks_info, iowait = self.data.get("磁盘信息", {})[0], self.data.get("iowait", 0) ++ report += f"系统iowait的值是{iowait}\n" ++ for disk_name, disk_info in disks_info.items(): ++ wait_time, queue_lenth, util, read_speed, write_speed, read_size, write_size = ( ++ 
disk_info.get("磁盘平均等待时间变化趋势", 0), ++ disk_info.get("磁盘平均请求队列长度变化趋势", 0), ++ disk_info.get("磁盘利用率", 0), ++ disk_info.get("单位时间读速率", 0), ++ disk_info.get("单位时间写速率", 0), ++ disk_info.get("单位时间读大小", 0), ++ disk_info.get("单位时间写大小", 0), ++ ) ++ report += f"磁盘{disk_name}的基本信息如下:\n" ++ report += f"磁盘利用率是{util},磁盘读速率是{read_speed},磁盘写速率是{write_speed}\n" ++ report += self.disk_info_analysis(wait_time, queue_lenth, util) ++ report += self.disk_rw_analysis(read_speed, write_speed, read_size, write_size, util) ++ return report ++ ++ def disk_info_analysis( ++ self, ++ wait_time: float, ++ queue_lenth: float, ++ util: float, ++ ) -> str: ++ disk_info_report = "" ++ queue_lenth_message = ( ++ "该磁盘设备请求队列的长度在增加,且设备利用率超过预设阈值,这可能表明该磁盘正在接近或达到其处理能力的极限" ++ if queue_lenth > 0 and util > 0.90 ++ else "" ++ ) ++ disk_info_report += self.generate_report_line(queue_lenth_message, queue_lenth_message) ++ ++ wait_time_message = ( ++ "该磁盘设备请求处理速度在下降,且设备利用率超过预设阈值, 这可能表明该磁盘正在接近或达到其处理能力的极限" ++ if wait_time > 0 and util > 0.90 ++ else "" ++ ) ++ disk_info_report += self.generate_report_line(wait_time_message, wait_time_message) ++ return disk_info_report ++ ++ def disk_rw_analysis( ++ self, ++ read_speed: float, ++ write_speed: float, ++ read_size: float, ++ write_size: float, ++ util: float, ++ ) -> str: ++ disk_rw_report = "" ++ read_size = read_size/1024 ++ write_size= write_size/1024 ++ ++ iops_message = ( ++ "该磁盘平均 Input/Ouput Operations Per Second (IOPS) 操作数超过预设限制,且设备利用率超过预设阈值, 这可能表明该磁盘正在接近或达到其处理能力的极限" ++ if read_speed + write_speed > 120 and util > 0.90 ++ else "" ++ ) ++ disk_rw_report += self.generate_report_line(iops_message, iops_message) ++ ++ size_message = ( ++ "该磁盘的平均传输速率超过预设带宽限制,且设备利用率超过预设阈值,这可能表明该磁盘正在接近或达到其处理能力的极限" ++ if read_size + write_size > 100 and util > 0.90 ++ else "" ++ ) ++ disk_rw_report += self.generate_report_line(size_message, size_message) ++ ++ return disk_rw_report ++ ++ def generate_report( ++ self, ++ disk_report: str ++ ) -> str: ++ # TO DO ++ # 要有一个报告模板,指明包含哪些信息,以及报告格式 ++ report_prompt = f""" ++ 以下内容是linux系统中磁盘相关的性能信息: ++ {disk_report} ++ 信息中所涉及到的数据准确无误,真实可信。 ++ ++ # OBJECTIVE # ++ 请根据上述信息,分析系统磁盘的性能状况。 ++ 要求: ++ 1.答案中不要包含任何优化建议。 ++ 2.答案中尽可能保留信息中真实有效的数据。 ++ 3.不要遗漏任何值得分析的信息。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 回答以"磁盘分析如下:"开头,然后另起一行逐条分析。 ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(report_prompt) + "\n" +diff --git a/copilot-tune/src/performance_analyzer/memory_analyzer.py b/copilot-tune/src/performance_analyzer/memory_analyzer.py +new file mode 100644 +index 0000000..aede6d7 +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/memory_analyzer.py +@@ -0,0 +1,72 @@ ++from .base_analyzer import BaseAnalyzer ++ ++class MemoryAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ ++ def analyze(self) -> str: ++ report = "基于采集的系统指标,内存初步的性能分析如下:\n" ++ swapout, oom_kill, swap_ratio, util = ( ++ self.data.get("swapout", 0), ++ self.data.get("omm_kill", 0), ++ self.data.get("交换空间使用率", 0), ++ self.data.get("内存使用率", 0) ++ ) ++ swap_ratio = 1 - swap_ratio ++ report += f"系统内存使用率是{util}\n" ++ report += self.omm_kill_analysis(oom_kill) ++ report += self.swap_analysis(swap_ratio) ++ report += self.swapout_analysis(swapout) ++ return report ++ ++ def omm_kill_analysis( ++ self, ++ oom_kill: float, ++) -> str: ++ return self.generate_report_line(oom_kill == 1, "系统近期发生过oom_kill行为, 
即内存严重不足,需要杀死进程来释放内存") ++ ++ def swap_analysis( ++ self, ++ swap_low: float, ++ ) -> str: ++ return self.generate_report_line(swap_low < 0.1, f"系统可用交换空间百分比为{swap_low},低于预设阈值, 系统可能很快会耗尽虚拟内存,需要减少运行程序的数量和大小或增加交换空间来避免完全耗尽") ++ ++ def swapout_analysis( ++ self, ++ swapout: float, ++ ) -> str: ++ return self.generate_report_line(swapout == 1, "系统持续以高速率将页面交换到交换空间,这表明系统物理可能内存不足") ++ ++ def generate_report( ++ self, ++ memory_report: str ++ ) -> str: ++ # TO DO ++ # 要有一个报告模板,指明包含哪些信息,以及报告格式 ++ report_prompt = f""" ++ 以下内容是linux系统中内存相关的性能信息: ++ {memory_report} ++ 信息中所涉及到的数据准确无误,真实可信。 ++ ++ # OBJECTIVE # ++ 请根据上述信息,分析系统内存的性能状况。 ++ 要求: ++ 1.答案中不要包含任何优化建议。 ++ 2.答案中尽可能保留信息中真实有效的数据。 ++ 3.不要遗漏任何值得分析的信息。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 回答以"内存分析如下:"开头,然后另起一行逐条分析。 ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(report_prompt) + "\n" +\ No newline at end of file +diff --git a/copilot-tune/src/performance_analyzer/micro_dep_analyzer.py b/copilot-tune/src/performance_analyzer/micro_dep_analyzer.py +new file mode 100644 +index 0000000..a4619d5 +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/micro_dep_analyzer.py +@@ -0,0 +1,95 @@ ++from .base_analyzer import BaseAnalyzer ++import logging ++class MicroDepAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ self.prompt_dict = { ++ "frontend_bound": "TopDown中的前端瓶颈(frontend bound)", ++ "bad_spec": "TopDown中的预测失败瓶颈(bad speculation)", ++ "retiring": "TopDown中的指令完成(retiring)", ++ "backend_bound": "TopDown中的后端瓶颈(backend bound)", ++ "frontend_latency_bound": "TopDown中的前端瓶颈下的前端延时瓶颈(frontend latency bound)", ++ "frontend_bandwidth_bound": "TopDown中的前端瓶颈下的前端带宽瓶颈(frontend bandwidth bound)", ++ "bs_mispred": "TopDown中的预测失败瓶颈中的分支预测失败瓶颈(bad speculation branch misprediction)", ++ "bs_mclear": "TopDown中的预测失败瓶颈中的流水线清空瓶颈(bad speculation machine clears)", ++ "core_bound": "TopDown中的后端瓶颈中的后端执行瓶颈(core bound)", ++ "mem_bound": "TopDown中的后端瓶颈中的后端内存子系统瓶颈(memory bound)", ++ "core_fsu_bound": "TopDown中的后端执行瓶颈中的浮点/向量计算瓶颈(core fsu bound)", ++ "core_other_bound": "TopDown中的后端执行瓶颈中的后端其他执行瓶颈(core other bound)", ++ "mem_l1_bound": "TopDown中的后端内存子系统瓶颈中的读取L1 cache造成的指令执行瓶颈(不包含L2/L3)", ++ "mem_l2_bound": "TopDown中的后端内存子系统瓶颈中的读取L2 cache造成的指令执行瓶颈(不包含L1/L3)", ++ "mem_l3_dram_bound": "TopDown中的后端内存子系统瓶颈中的读取L3以及内存造成的指令执行瓶颈(不包含L1/L2)", ++ "mem_store_bound": "TopDown中的后端内存子系统瓶颈中的内存写瓶颈(memory store bound)", ++ "context_switches": "上下文切换次数(context-switches)", ++ "cpu_migrations": "进程在不同CPU核之间的迁移次数(cpu-migrations)", ++ "page_faults": "缺页异常次数(page-faults)", ++ "l1i_missrate": "L1指令miss rate", ++ "l1d_missrate": "L1数据miss rate", ++ "l2i_missrate": "L2指令miss rate", ++ "l2d_missrate": "L2数据miss rate", ++ "l1i_mpki": "L1指令每千条指令中miss次数", ++ "l1d_mpki": "L1数据每千条指令中miss次数", ++ "l2i_mpki": "L2指令每千条指令中miss次数", ++ "l2d_mpki": "L2数据每千条指令中miss次数", ++ "branch_missrate": "分支预测失败率(branch missrate)", ++ "alu_isq_stall": "算术逻辑单元全部被占用导致的执行瓶颈", ++ "lsu_isq_stall": "访存逻辑单元全部被占用导致的执行瓶颈", ++ "fsu_isq_stall": "浮点单元全部被占用导致的执行瓶颈", ++ "l1i_tlb_missrate": "L1指令快表miss rate(l1i_tlb_missrate)", ++ "l1d_tlb_missrate": "L1数据快表miss rate(l1d_tlb_missrate)", ++ "l2i_tlb_missrate": "L2指令快表miss rate(l2i_tlb_missrate)", ++ "l2d_tlb_missrate": "L2数据快表miss rate(l2d_tlb_missrate)", ++ "itlb_walk_rate": "指令页表缓存未命中时触发页表遍历的频率(itlb_walk_rate)", ++ "dtlb_walk_rate": "数据页表缓存未命中时触发页表遍历的频率(dtlb_walk_rate)", ++ "l1i_tlb_mpki": 
"L1指令TLB每千条指令中miss次数", ++ "l1d_tlb_mpki": "L1数据TLB每千条指令中miss次数", ++ "l2i_tlb_mpki": "L2指令TLB每千条指令中miss次数", ++ "l2d_tlb_mpki": "L2数据TLB每千条指令中miss次数", ++ "itlb_walk_mpki": "指令TLB每千条指令中到页表查找次数", ++ "dtlb_walk_mpki": "指令TLB每千条指令中到页表查找次数", ++ "div_stall": "除法指令在关键路径导致的执行瓶颈", ++ } ++ def analyze(self) -> str: ++ report = "基于采集的系统指标, 微架构初步的性能分析报告如下: \n" ++ processed_data_dict = {} ++ for k, v in self.data.items(): ++ if k in self.prompt_dict.keys(): ++ processed_data_dict[self.prompt_dict[k]] = v ++ else: ++ logging.warning("Cannot find prompt for item {k}") ++ report += f"系统微架构状态是{processed_data_dict}\n" ++ return report ++ ++ def generate_report( ++ self, ++ micro_report: str ++ ) -> str: ++ # TO DO ++ # 要有一个报告模板,指明包含哪些信息,以及报告格式 ++ report_prompt = f""" ++ 以下内容是linux系统中应用微架构相关的性能信息: ++ {micro_report} ++ 信息中所涉及到的数据准确无误,真实可信。 ++ ++ # OBJECTIVE # ++ 请根据上述信息,分析系统应用微架构的性能状况。 ++ 要求: ++ 1.答案中不要包含任何优化建议。 ++ 2.答案中尽可能保留信息中真实有效的数据。 ++ 3.不要遗漏任何值得分析的信息。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 回答以"应用微架构分析如下:"开头,然后另起一行逐条分析。 ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(report_prompt) + "\n" +diff --git a/copilot-tune/src/performance_analyzer/network_analyzer.py b/copilot-tune/src/performance_analyzer/network_analyzer.py +new file mode 100644 +index 0000000..729447c +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/network_analyzer.py +@@ -0,0 +1,103 @@ ++from .base_analyzer import BaseAnalyzer ++ ++class NetworkAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ ++ def analyze(self) -> str: ++ report = "基于采集的系统指标, 网络初步的性能分析如下:\n" ++ listenOverflows, fulldocookies, fulldrop, network_adapter = ( ++ self.data.get("listenOverflows", 0), ++ self.data.get("fulldocookies", 0), ++ self.data.get("fulldrop", 0), ++ self.data.get("网卡指标", "") ++ ) ++ report += self.listenOverflows_analysis(listenOverflows) ++ report += self.fulldocookies_analysis(fulldocookies) ++ report += self.fulldrop_analysis(fulldrop) ++ report += self.network_adapter_analysis(network_adapter) ++ return report ++ ++ def listenOverflows_analysis( ++ self, ++ listenOverflows: float ++ ) -> str: ++ return self.generate_report_line(listenOverflows == 1, "系统存在因为监听队列回滚而丢弃TCP连接的现象。这通常表明系统无法及时处理传入的连接请求, 导致连接被系统自动丢弃") ++ ++ def fulldrop_analysis( ++ self, ++ fulldrop: float ++ ) -> str: ++ return self.generate_report_line(fulldrop == 1, "系统存在因为TCP请求队列满了而丢弃新的连接请求的现象。这通常表明系统无法及时处理传入的连接请求, 导致内核自动丢弃这些请求") ++ ++ def fulldocookies_analysis( ++ self, ++ fulldocookies: float ++ ) -> str: ++ return self.generate_report_line(fulldocookies == 1, "系统存在因为TCP请求队列满了而发送SYN COOKIE的现象。这通常表明系统无法及时处理传入的连接请求, 导致内核自动采取措施来处理这些请求, 例如发送SYN COOKIE") ++ ++ def network_adapter_analysis( ++ self, ++ network_adapter: str ++ ) -> str: ++ network_adapter_prompt = f""" ++ # CONTEXT # ++ 当前有linux系统网卡的数据,性能指标是在linux系统中执行 netstat -i 获得的输出,内容如下: ++ {network_adapter} ++ ++ # OBJECTIVE # ++ 请根据这些性能指标,生成一份逻辑清晰、条理清楚的系统网卡的性能总结报告。 ++ 要求: ++ 1.答案中只分析可能对系统性能产生影响的指标数据。 ++ 2.答案中不要包含任何优化建议。 ++ 3.答案中尽可能保留信息中真实有效的数据。 ++ 4.答案不超过200字。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(network_adapter_prompt) ++ ++ def generate_report( ++ self, ++ network_report: str ++ ) -> str: ++ # TO 
DO ++ # 要有一个报告模板,指明包含哪些信息,以及报告格式 ++ report_prompt = f""" ++ 以下内容是linux系统中网络传输相关的性能信息: ++ {network_report} ++ 信息中所涉及到的数据准确无误,真实可信。 ++ ++ # OBJECTIVE # ++ 请根据上述信息,分析系统网络传输的性能状况。 ++ 要求: ++ 1.答案中不要包含任何优化建议。 ++ 2.答案中尽可能保留信息中真实有效的数据。 ++ 3.不要遗漏任何值得分析的信息。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰,让你的回答真实可信 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 回答以"网络分析如下:"开头,然后另起一行逐条分析。 ++ 如果有多条分析结论,请用数字编号分点作答。 ++ ++ """ ++ return self.ask_llm(report_prompt) + "\n" +\ No newline at end of file +diff --git a/copilot-tune/src/performance_analyzer/performance_analyzer.py b/copilot-tune/src/performance_analyzer/performance_analyzer.py +new file mode 100644 +index 0000000..1b55851 +--- /dev/null ++++ b/copilot-tune/src/performance_analyzer/performance_analyzer.py +@@ -0,0 +1,111 @@ ++from .cpu_analyzer import CpuAnalyzer ++from .disk_analyzer import DiskAnalyzer ++from .memory_analyzer import MemoryAnalyzer ++from .network_analyzer import NetworkAnalyzer ++from .app_analyzer import AppAnalyzer ++from .micro_dep_analyzer import MicroDepAnalyzer ++from .base_analyzer import BaseAnalyzer ++from typing import Tuple ++from src.utils.thread_pool import ThreadPoolManager ++ ++ ++class PerformanceAnalyzer(BaseAnalyzer): ++ def __init__(self, **kwargs): ++ super().__init__(**kwargs) ++ self.cpu_analyzer = CpuAnalyzer( ++ app=kwargs["app"], data=self.data.get("Cpu", {}) ++ ) ++ self.disk_analyzer = DiskAnalyzer( ++ app=kwargs["app"], data=self.data.get("Disk", {}) ++ ) ++ self.memory_analyzer = MemoryAnalyzer( ++ app=kwargs["app"], data=self.data.get("Memory", {}) ++ ) ++ self.network_analyzer = NetworkAnalyzer( ++ app=kwargs["app"], data=self.data.get("Network", {}) ++ ) ++ self.micro_analyer = MicroDepAnalyzer( ++ app=kwargs["app"], data=self.data.get("micro_dep", {}) ++ ) ++ self.app_analyzer = AppAnalyzer( ++ app=kwargs["app"], data=self.data.get("Application", {}) ++ ) ++ self.thread_pool = ThreadPoolManager(max_workers=5) ++ ++ def analyze(self, report: str) -> str: ++ bottle_neck_prompt = f""" ++ # CONTEXT # ++ 当前linux系统的性能分析报告如下,报告中所涉及到的数据准确无误,真实可信: ++ {report} ++ ++ # OBJECTIVE # ++ 请根据系统性能分析报告,确定当前系统是否存在性能瓶颈;如果存在性能瓶颈,则该瓶颈主要是存在于系统的哪个方面。 ++ 你应该依据多条信息和多个指标的数据进行综合判断,不要基于单点信息轻易下结论,你最终的结论应该能找到多个佐证。 ++ 要求: ++ 1.你必须从[CPU,NETWORK,DISK,MEMORY,NONE]这五个选项中选择一项作为你的答案。 ++ 2.不要回答多余的文字,你的答案必须严格和上述选项描述一致。 ++ 3.如果你认为没有性能瓶颈,请选择NONE。 ++ ++ # STYLE # ++ 你是一个专业的系统运维专家,你只用回答上述五个选项之一 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请认真思考后给出你的答案。 ++ ++ # RESPONSE FORMAT # ++ 请直接回答五个选项之一,不要包含多余文字 ++ ++ """ ++ result = self.ask_llm(bottle_neck_prompt) ++ bottlenecks = { ++ "cpu": "CPU", ++ "disk": "DISK", ++ "network": "NETWORK", ++ "memory": "MEMORY", ++ "none": "NONE", ++ } ++ ++ # 转换为小写并查找瓶颈 ++ for key, value in bottlenecks.items(): ++ if key in result.lower(): ++ return value ++ ++ # 如果没有找到明确的瓶颈,返回UNKNOWN BOTTLENECKS ++ return "UNKNOWN BOTTLENECKS" ++ ++ def generate_report(self) -> Tuple[str, str]: ++ cpu_analyzer_task = self.thread_pool.add_task(self.cpu_analyzer.run) ++ disk_analyzer_task = self.thread_pool.add_task(self.disk_analyzer.run) ++ memory_analyzer_task = self.thread_pool.add_task(self.memory_analyzer.run) ++ network_analyzer_task = self.thread_pool.add_task(self.network_analyzer.run) ++ micro_analyzer_task = self.thread_pool.add_task(self.micro_analyer.run) ++ app_analyzer_task = self.thread_pool.add_task(self.app_analyzer.run) ++ ++ self.thread_pool.run_all_tasks() ++ task_results = 
self.thread_pool.get_all_results() ++ ++ report_results = {} ++ for task_result in task_results: ++ if task_result.status_code != 0: ++ raise RuntimeError( ++ f"failed to execute task {task_result.func_name}, exception is {task_result.result}" ++ ) ++ report_results[task_result.uuid] = task_result.result ++ ++ os_performance_report = "" ++ os_performance_report += report_results[cpu_analyzer_task] ++ os_performance_report += report_results[disk_analyzer_task] ++ os_performance_report += report_results[memory_analyzer_task] ++ os_performance_report += report_results[network_analyzer_task] ++ os_performance_report += report_results[micro_analyzer_task] ++ app_performance_report = "" ++ app_performance_report += report_results[app_analyzer_task] ++ return os_performance_report, app_performance_report ++ ++ def run(self) -> Tuple[str, str]: ++ os_performance_report, app_performance_report = self.generate_report() ++ bottleneck = self.analyze(os_performance_report) ++ return os_performance_report + app_performance_report, bottleneck +diff --git a/copilot-tune/src/performance_collector/__init__.py b/copilot-tune/src/performance_collector/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/src/performance_collector/app_collector.py b/copilot-tune/src/performance_collector/app_collector.py +new file mode 100644 +index 0000000..31886a2 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/app_collector.py +@@ -0,0 +1,45 @@ ++import importlib ++import logging ++ ++from src.utils.manager.task_manager import TaskManager ++from src.utils.shell_execute import SshClient ++ ++ ++def load_app_collector(app: str): ++ # 构建模块路径和类名 ++ module_path = f".application.{app.lower()}_collector" ++ try: ++ # 动态导入模块(当前模块是包内的,使用相对导入) ++ module = importlib.import_module(module_path, package=__package__) ++ ++ return module ++ except (ImportError, AttributeError) as e: ++ logging.error( ++ f"no module named {module_path} can be found, will skip collect application workload data." 
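++ # 动态导入失败时仅记录日志并返回 None,调用方 AppCollector 会据此跳过应用层数据采集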
++ ) ++ ++ return None ++ ++ ++class AppCollector: ++ def __init__( ++ self, ++ ssh_client: SshClient, ++ app: str = None, ++ ): ++ self.app = app ++ app_collector_module = load_app_collector(self.app) ++ ++ if not app_collector_module: ++ self.collector = None ++ else: ++ self.collector = TaskManager( ++ ssh_client=ssh_client, ++ modules=[app_collector_module], ++ timeout=60, ++ ) ++ ++ def run(self): ++ if not self.collector: ++ return {} ++ return self.collector.run() +diff --git a/copilot-tune/src/performance_collector/application/ceph_collector.py b/copilot-tune/src/performance_collector/application/ceph_collector.py +new file mode 100644 +index 0000000..6a1b62a +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/ceph_collector.py +@@ -0,0 +1,165 @@ ++import json ++import re ++ ++from src.utils.collector.metric_collector import ( ++ snapshot_task, ++ CollectMode, ++) ++ ++ ++@snapshot_task( ++ cmd="ceph -s", ++ collect_mode=CollectMode.ASYNC, ++ tag="ceph集群状态信息", ++) ++def parse_ceph_s(output: str) -> dict: ++ result = {} ++ ++ # degraded objects ++ degraded_match = re.search(r"(\d+)\s+degraded objects", output) ++ if degraded_match: ++ result["降级对象数"] = int(degraded_match.group(1)) ++ ++ # recovery speed ++ recovery_match = re.search(r"recovery io.*?([\d\.]+)\s*([KMGT]?B)/s", output) ++ if recovery_match: ++ result["恢复速度"] = recovery_match.group(1) + recovery_match.group(2) ++ ++ # slow ops ++ slow_ops_match = re.search(r"(\d+)\s+slow ops", output) ++ if slow_ops_match: ++ result["慢请求数"] = int(slow_ops_match.group(1)) ++ ++ return {"ceph -s": result} ++ ++ ++@snapshot_task( ++ cmd="ceph df", ++ collect_mode=CollectMode.ASYNC, ++ tag="ceph磁盘使用情况", ++) ++def parse_ceph_df_output(text: str) -> dict: ++ result = { ++ "总容量": None, ++ "已用容量": None, ++ "可用容量": None, ++ "使用率(%)": None, ++ "存储类型": [], ++ "存储池": [] ++ } ++ ++ lines = text.strip().splitlines() ++ section = None ++ ++ for line in lines: ++ line = line.strip() ++ if line.startswith('--- RAW STORAGE'): ++ section = 'raw' ++ continue ++ elif line.startswith('--- POOLS'): ++ section = 'pools' ++ continue ++ elif not line or line.startswith('CLASS') or line.startswith('POOL'): ++ continue ++ ++ parts = re.split(r'\s{2,}|\t', line) ++ ++ if section == 'raw' and len(parts) >= 6: ++ storage = { ++ "类型": parts[0], ++ "总容量": parts[1], ++ "可用容量": parts[2], ++ "已用容量": parts[3], ++ "原始已用容量": parts[4], ++ "原始使用率(%)": float(parts[5]) ++ } ++ result["存储类型"].append(storage) ++ ++ if parts[0] == "TOTAL": ++ result["总容量"] = parts[1] ++ result["可用容量"] = parts[2] ++ result["已用容量"] = parts[3] ++ result["使用率(%)"] = float(parts[5]) ++ ++ elif section == 'pools' and len(parts) >= 7: ++ pool = { ++ "名称": parts[0], ++ "ID": int(parts[1]), ++ "PG数量": int(parts[2]), ++ "存储量": parts[3], ++ "对象数": int(parts[4]), ++ "已用容量": parts[5], ++ "使用率(%)": float(parts[6]), ++ "最大可用容量": parts[7] if len(parts) > 7 else None ++ } ++ result["存储池"].append(pool) ++ ++ return {"ceph df": result} ++ ++ ++@snapshot_task( ++ cmd="ceph pg stat", ++ collect_mode=CollectMode.ASYNC, ++ tag="ceph PG(Placement Groups,数据放置组)的详细状态统计", ++) ++def parse_ceph_pg_stat(output: str) -> dict: ++ result = {} ++ pg_match = re.search(r"(\d+)\s+active.+", output) ++ if pg_match: ++ result["PG 总数"] = int(pg_match.group(1)) ++ return {"ceph pg stat": result} ++ ++ ++@snapshot_task( ++ cmd="ceph tell osd.* perf dump", ++ collect_mode=CollectMode.ASYNC, ++ tag="所有 OSD 的性能统计数据,包含操作延迟、IOPS、吞吐等指标", ++) ++def parse_perf_dump_str(raw_str: str) -> dict: ++ def get_value_by_path(d, path): 
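++ # 按 "a.b.c" 形式的点分路径逐层读取嵌套 dict;任一层缺失或值为 None 时返回 None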
++ keys = path.split('.') ++ cur = d ++ for k in keys: ++ if not isinstance(cur, dict): ++ return None ++ cur = cur.get(k) ++ if cur is None: ++ return None ++ return cur ++ ++ path_map = { ++ "osd.op_r": "读请求数", ++ "osd.op_w": "写请求数", ++ "osd.op_latency.avgcount": "总体请求数", ++ "osd.op_latency.avgtime": "平均请求延迟(s)", ++ "osd.op_r.avgtime": "平均读延迟(s)", ++ "osd.op_w.avgtime": "平均写延迟(s)", ++ "bluefs.db_write_bytes": "BlueFS写入字节数", ++ "bluefs.wal_write_bytes": "BlueFS WAL写入字节数", ++ "filestore.journal_latency.avgcount": "Journal请求数", ++ "filestore.journal_latency.avgtime": "Journal延迟平均(s)", ++ "bluestore.kv_commit_lat.avgtime": "KV提交延迟(s)" ++ } ++ ++ # 解析多osd json字符串 ++ pattern = re.compile(r'(osd\.\d+):\s*({.*?})(?=(?:\nosd\.\d+:)|\Z)', re.S) ++ result = {} ++ ++ for match in pattern.finditer(raw_str): ++ osd_name = match.group(1) ++ json_str = match.group(2) ++ ++ try: ++ perf_data = json.loads(json_str) ++ except json.JSONDecodeError: ++ perf_data = {} ++ ++ metrics = {} ++ for eng_path, cn_name in path_map.items(): ++ val = get_value_by_path(perf_data, eng_path) ++ if val is not None: ++ metrics[cn_name] = val ++ ++ result[osd_name] = metrics ++ ++ return {"ceph tell osd.* perf dump": result} +diff --git a/copilot-tune/src/performance_collector/application/flink_collector.py b/copilot-tune/src/performance_collector/application/flink_collector.py +new file mode 100644 +index 0000000..ce0b8df +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/flink_collector.py +@@ -0,0 +1,269 @@ ++import json ++import logging ++ ++from src.config import config ++from src.utils.collector.metric_collector import snapshot_task, CollectMode ++ ++FLINK_HOST = config["servers"][0]["ip"] ++FLINK_API = f"http://{FLINK_HOST}:8081" ++ ++logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") ++ ++ ++@snapshot_task( ++ cmd=( ++ f"curl -s {FLINK_API}/jobs | jq -r '.jobs[0].id' | xargs -I{{}} curl -s {FLINK_API}/jobs/{{}}" ++ ), ++ tag="flink作业详情", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_job_detail(output: str) -> dict: ++ try: ++ job_detail = json.loads(output) ++ vertices = job_detail.get("vertices", []) ++ ++ total_tasks = 0 ++ running_tasks = 0 ++ failed_tasks = 0 ++ parallelisms = [] ++ ++ for v in vertices: ++ p = v.get("parallelism", 0) ++ status = v.get("status", "") ++ total_tasks += p ++ parallelisms.append(p) ++ if status == "RUNNING": ++ running_tasks += 1 ++ if status == "FAILED": ++ failed_tasks += 1 ++ ++ avg_parallelism = round(sum(parallelisms) / len(parallelisms), 2) if parallelisms else 0 ++ max_parallelism = max(parallelisms) if parallelisms else 0 ++ ++ return { ++ f"curl -s {FLINK_API}/jobs | jq -r '.jobs[0].id' ": ++ { ++ "任务总数": total_tasks, ++ "运行中任务数": running_tasks, ++ "失败任务数": failed_tasks, ++ "平均并行度": avg_parallelism, ++ "最大并行度": max_parallelism, ++ } ++ } ++ ++ except Exception as e: ++ logging.warning(f"解析 flink job detail 失败: {e}") ++ return {} ++ ++ ++@snapshot_task( ++ cmd=( ++ f"curl -s {FLINK_API}/jobs | jq -r '.jobs[0].id' | xargs -I{{}} curl -s {FLINK_API}/jobs/{{}}/checkpoints" ++ ), ++ tag="flink checkpoint状态", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_checkpoint_status(output: str) -> dict: ++ try: ++ ckpt_info = json.loads(output) ++ ++ # 从 counts 字段获取失败次数 ++ failed_count = ckpt_info.get("counts", {}).get("failed", 0) ++ ++ # 获取最近一次完成的检查点信息 (注意:可能是 None/null) ++ latest_completed = ckpt_info.get("latest", {}).get("completed") ++ # 只有当 latest_completed 存在(不为 None)时才提取其数据 ++ if latest_completed is not None: 
latest_duration = latest_completed.get("duration", 0) ++ latest_state_size = latest_completed.get("state_size", 0) ++ else: ++ # 如果没有完成的检查点,设置默认值 ++ latest_duration = 0 ++ latest_state_size = 0 ++ ++ return { ++ f"curl -s {FLINK_API}/jobs | jq -r '.jobs[0].id' ": ++ { ++ "最近一次Checkpoint耗时(ms)": latest_duration, ++ "最近一次Checkpoint状态大小(bytes)": latest_state_size, ++ "Checkpoint失败次数": failed_count, ++ } ++ } ++ except Exception as e: ++ logging.warning(f"解析 flink checkpoint 信息失败: {e}") ++ return {} ++ ++ ++@snapshot_task( ++ cmd=f"curl -s {FLINK_API}/jobs/overview", ++ tag="flink作业总览", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_job_overview(output: str) -> dict: ++ try: ++ data = json.loads(output) ++ jobs = data.get("jobs", []) ++ return { ++ f"curl -s {FLINK_API}/jobs/overview": ++ { ++ "作业总数": len(jobs), ++ "运行中作业数": sum(1 for j in jobs if j.get("state") == "RUNNING"), ++ "失败作业数": sum(1 for j in jobs if j.get("state") == "FAILED"), ++ } ++ } ++ except Exception as e: ++ logging.warning(f"解析 flink jobs overview 失败: {e}") ++ return {} ++ ++ ++@snapshot_task( ++ cmd=f"curl -s {FLINK_API}/taskmanagers", ++ tag="flink资源使用", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_resource_usage(output: str) -> dict: ++ try: ++ data = json.loads(output) ++ tms = data.get("taskmanagers", []) ++ ++ total_slots = sum(tm.get("slotsNumber", 0) for tm in tms) ++ available_slots = sum(tm.get("slotsAvailable", 0) for tm in tms) ++ total_heap = sum(tm.get("heapUsed", 0) for tm in tms) ++ total_managed = sum(tm.get("managedMemoryUsed", 0) for tm in tms) ++ ++ return { ++ "flink资源使用": ++ { ++ "TaskManager数量": len(tms), ++ "总Slots数": total_slots, ++ "空闲Slots数": available_slots, ++ "Heap内存使用(MB)": round(total_heap / 1024 / 1024, 2), ++ "Managed内存使用(MB)": round(total_managed / 1024 / 1024, 2), ++ } ++ } ++ except Exception as e: ++ logging.warning(f"解析 flink taskmanagers 信息失败: {e}") ++ return {} ++ ++ ++@snapshot_task( ++ cmd=( ++ f"curl -s {FLINK_API}/jobs | jq -r '.jobs[0].id' | xargs -I{{}} curl -s {FLINK_API}/jobs/{{}}/backpressure" ++ ), ++ tag="flink反压指标", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_backpressure(output: str) -> dict: ++ try: ++ bp = json.loads(output) ++ levels = bp.get("backpressure-levels", []) ++ blocked = sum(1 for v in levels if v.get("backpressure-level") == "BLOCKED") ++ ratio = f"{(blocked / len(levels)) * 100:.2f}%" if levels else "0%" ++ return { ++ f"curl -s {FLINK_API}/taskmanagers": ++ { ++ "阻塞算子数量": blocked, ++ "Backpressure阻塞率": ratio ++ } ++ } ++ except Exception as e: ++ logging.warning(f"解析 flink backpressure 失败: {e}") ++ return {} ++ ++ ++# 核心吞吐量指标采集(确保返回所有关键指标) ++@snapshot_task( ++ cmd=( ++ f"curl -s {FLINK_API}/jobs | jq -r '.jobs[0].id' | xargs -I{{}} curl -s " ++ f"{FLINK_API}/jobs/{{}}/metrics?get=" ++ "numRecordsInPerSecond," ++ "numRecordsOutPerSecond," ++ "numBytesInPerSecond," ++ "numBytesOutPerSecond," ++ "latency," ++ "lastCheckpointSize" ++ ), ++ tag="flink_throughput_metrics", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_throughput_metrics(output: str) -> dict: ++ """采集核心吞吐量指标并确保所有字段都有值""" ++ try: ++ metrics = json.loads(output) ++ result = { ++ "输入吞吐": 0.0, ++ "输出吞吐": 0.0, ++ "输入数据量": 0.0, ++ "输出数据量": 0.0, ++ "延迟指标": 0.0, ++ "检查点大小": 0 ++ } ++ ++ for metric in metrics: ++ metric_id = metric["id"] ++ value = metric["value"] ++ ++ if "numRecordsInPerSecond" in metric_id: ++ result["输入吞吐"] = float(value) ++ elif "numRecordsOutPerSecond" in metric_id: ++ result["输出吞吐"] = float(value) ++ elif "numBytesInPerSecond" in metric_id: ++ result["输入数据量"] = 
float(value) ++ elif "numBytesOutPerSecond" in metric_id: ++ result["输出数据量"] = float(value) ++ elif "latency" in metric_id and "p99" in metric_id: ++ result["延迟指标"] = float(value) ++ elif "lastCheckpointSize" in metric_id: ++ result["检查点大小"] = int(value) ++ ++ return {"flink_throughput_metrics": result} ++ except Exception as e: ++ logging.error(f"解析吞吐量指标失败: {e}") ++ # 返回默认值而不是空字典 ++ return {"flink_throughput_metrics": { ++ "输入吞吐": 0.0, ++ "输出吞吐": 0.0, ++ "输入数据量": 0.0, ++ "输出数据量": 0.0, ++ "延迟指标": 0.0, ++ "检查点大小": 0 ++ }} ++ ++ ++ # 资源使用指标采集(函数名与上方作业级资源采集函数区分,避免重名遮蔽) ++ @snapshot_task( ++ cmd=( ++ f"curl -s {FLINK_API}/taskmanagers | jq -r '.taskmanagers[0].id' | " ++ f"xargs -I{{}} curl -s {FLINK_API}/taskmanagers/{{}}/metrics?get=" ++ "Status.JVM.CPU.Load," ++ "Status.JVM.Memory.Heap.Used" ++ ), ++ tag="flink_resource_usage", ++ collect_mode=CollectMode.ASYNC ++) ++def flink_taskmanager_resource_usage(output: str) -> dict: ++ """采集资源使用指标并确保返回有效数据""" ++ try: ++ metrics = json.loads(output) ++ resource_data = { ++ "CPU负载": 0.0, ++ "堆内存使用": 0 ++ } ++ ++ for metric in metrics: ++ metric_id = metric["id"] ++ value = metric["value"] ++ ++ if "CPU.Load" in metric_id: ++ resource_data["CPU负载"] = float(value) * 100 # 转换为百分比 ++ elif "Heap.Used" in metric_id: ++ resource_data["堆内存使用"] = int(value) ++ ++ return {"flink_resource_usage": resource_data} ++ except Exception as e: ++ logging.error(f"解析资源指标失败: {e}") ++ # 返回默认值而不是空字典 ++ return {"flink_resource_usage": { ++ "CPU负载": 0.0, ++ "堆内存使用": 0 ++ }} +diff --git a/copilot-tune/src/performance_collector/application/gaussdb_collector.py b/copilot-tune/src/performance_collector/application/gaussdb_collector.py +new file mode 100644 +index 0000000..7210719 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/gaussdb_collector.py +@@ -0,0 +1,206 @@ ++import logging ++from io import StringIO ++ ++import pandas as pd ++ ++from src.utils.collector.metric_collector import ( ++ period_task, ++ snapshot_task, ++ CollectMode, ++) ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++GAUSS_INTERVAL = 180 ++ ++ ++# -------------------- 1. 后台写入与检查点(两次采样) -------------------- ++@period_task( ++ cmd='gsql -d tpch -A -F , -c "SELECT * FROM pg_stat_bgwriter;"', ++ collect_mode=CollectMode.ASYNC, ++ tag="GaussDB后台写入与检查点", ++ delay=0, ++ sample_count=2, ++ interval=GAUSS_INTERVAL, ++) ++def gauss_bgwriter_parser(output: list[str]) -> dict: ++ if len(output) < 2: ++ return {} ++ df1 = pd.read_csv(StringIO(output[0])) ++ df2 = pd.read_csv(StringIO(output[1])) ++ if df1.empty or df2.empty: ++ return {} ++ ++ r1, r2 = df1.iloc[0].to_dict(), df2.iloc[0].to_dict() ++ mapping = { ++ "checkpoints_timed": "定时检查点", ++ "checkpoints_req": "请求检查点", ++ "checkpoint_write_time": "检查点写入耗时(ms)", ++ "checkpoint_sync_time": "检查点同步耗时(ms)", ++ "buffers_checkpoint": "检查点写出页数", ++ "buffers_clean": "后台清理写出页数", ++ "maxwritten_clean": "后台清理超限次数", ++ "buffers_backend": "后端写出页数", ++ "buffers_backend_fsync": "后端 fsync 次数", ++ "buffers_alloc": "分配新缓冲区页数", ++ } ++ result = {} ++ for key, label in mapping.items(): ++ try: ++ delta = int(r2.get(key, 0)) - int(r1.get(key, 0)) ++ except (ValueError, TypeError): ++ delta = 0 ++ result[f"{GAUSS_INTERVAL // 60}分钟内{label}"] = max(delta, 0) ++ cmd = '''gsql -d tpch -A -F , -c "SELECT * FROM pg_stat_bgwriter;"''' ++ return {cmd: result}
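++ ++ # 说明:该解析函数采用"两次采样求差值"的方式,将 pg_stat_bgwriter 的累计计数转换为采样间隔内的增量; ++ # max(delta, 0) 用于规避计数器重置(如实例重启)导致的负值。 ++ ++ ++ # -------------------- 2. 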
事务与IO(两次采样) -------------------- ++@period_task( ++ cmd='''gsql -d tpch -A -F , -c " ++ SELECT sum(xact_commit) as commits, ++ sum(xact_rollback) as rollbacks, ++ sum(blks_read) as blks_read, ++ sum(blks_hit) as blks_hit, ++ sum(tup_returned) as tup_returned, ++ sum(tup_fetched) as tup_fetched ++ FROM pg_stat_database;"''', ++ collect_mode=CollectMode.ASYNC, ++ tag="GaussDB事务与IO", ++ delay=0, ++ sample_count=2, ++ interval=GAUSS_INTERVAL, ++) ++def gauss_dbstat_parser(output: list[str]) -> dict: ++ if len(output) < 2: ++ return {} ++ df1 = pd.read_csv(StringIO(output[0])) ++ df2 = pd.read_csv(StringIO(output[1])) ++ if df1.empty or df2.empty: ++ return {} ++ ++ r1, r2 = df1.iloc[0].to_dict(), df2.iloc[0].to_dict() ++ result = {} ++ for col in ("commits", "rollbacks", "blks_read", "blks_hit", "tup_returned", "tup_fetched"): ++ try: ++ delta = int(r2[col]) - int(r1[col]) ++ except (ValueError, TypeError): ++ delta = 0 ++ result[f"{GAUSS_INTERVAL // 60}分钟内{col}"] = max(delta, 0) ++ ++ # 计算命中率 ++ hit_delta = result[f"{GAUSS_INTERVAL // 60}分钟内blks_hit"] ++ read_delta = result[f"{GAUSS_INTERVAL // 60}分钟内blks_read"] ++ result[f"{GAUSS_INTERVAL // 60}分钟内Buffer命中率"] = ( ++ round(hit_delta * 100 / (hit_delta + read_delta), 2) if (hit_delta + read_delta) else 0 ++ ) ++ cmd = '''gsql -d tpch -A -F , -c " ++ SELECT sum(xact_commit) as commits, ++ sum(xact_rollback) as rollbacks, ++ sum(blks_read) as blks_read, ++ sum(blks_hit) as blks_hit, ++ sum(tup_returned) as tup_returned, ++ sum(tup_fetched) as tup_fetched ++ FROM pg_stat_database;"''' ++ return {cmd: result} ++ ++ ++# -------------------- 3. 会话信息(实时快照) -------------------- ++@snapshot_task( ++ cmd='''gsql -d tpch -A -F , -c " ++SELECT datname, state, waiting, enqueue ++FROM pg_stat_activity;"''', ++ collect_mode=CollectMode.ASYNC, ++ tag="GaussDB会话信息", ++) ++def gauss_activity_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = { ++ "datname": "数据库名", ++ "state": "连接状态", ++ "waiting": "是否等待", ++ "enqueue": "排队/锁信息", ++ } ++ cmd = '''gsql -d tpch -A -F , -c "SELECT datname, state, waiting, enqueueFROM pg_stat_activity;"''' ++ result = [ ++ {mapping.get(k, k): v for k, v in row.items()} ++ for _, row in df.iterrows() ++ ] ++ return {cmd: result} ++ ++ ++# -------------------- 4. 锁信息(实时快照) -------------------- ++@snapshot_task( ++ cmd='''gsql -d tpch -A -F , -c "SELECT mode, granted, COUNT(*) AS count FROM pg_locks GROUP BY mode, granted;"''', ++ collect_mode=CollectMode.ASYNC, ++ tag="GaussDB锁信息", ++) ++def gauss_locks_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = {"mode": "锁模式", "granted": "是否已授予", "count": "锁数量"} ++ cmd = '''gsql -d tpch -A -F , -c "SELECT mode, granted, COUNT(*) AS count FROM pg_locks GROUP BY mode, granted;"''' ++ result = [ ++ {mapping.get(k, k): v for k, v in row.items()} for _, row in df.iterrows() ++ ] ++ return {cmd: result} ++ ++ ++# -------------------- 5. 
数据库级统计(实时快照) -------------------- ++@snapshot_task( ++ cmd='''gsql -d tpch -A -F , -c "SELECT datname, numbackends, xact_commit, xact_rollback, ++ blks_read, blks_hit, pg_database_size(datname) AS db_size_bytes ++ FROM pg_stat_database WHERE datname NOT IN ('template0', 'template1');"''', ++ collect_mode=CollectMode.ASYNC, ++ tag="GaussDB数据库级指标", ++) ++def gauss_database_snapshot_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = { ++ "datname": "数据库名", ++ "numbackends": "连接数", ++ "xact_commit": "提交事务数", ++ "xact_rollback": "回滚事务数", ++ "blks_read": "磁盘读块数", ++ "blks_hit": "缓冲命中块数", ++ "db_size_bytes": "数据库大小(Bytes)", ++ } ++ cmd = '''gsql -d tpch -A -F , -c "SELECT datname, numbackends, xact_commit, xact_rollback, ++ blks_read, blks_hit, pg_database_size(datname) AS db_size_bytes ++ FROM pg_stat_database WHERE datname NOT IN ('template0', 'template1');"''' ++ result = [ ++ {mapping.get(k, k): v for k, v in row.items()} for _, row in df.iterrows() ++ ] ++ ++ return {cmd: result} ++ ++ ++# -------------------- 6. 内存使用(实时快照) -------------------- ++@snapshot_task( ++ cmd='''gsql -d tpch -A -F , -c " ++ SELECT ++ 'localhost' AS node_name, ++ SUM(usedsize) AS dynamic_used_memory_bytes, ++ MAX(usedsize) AS dynamic_peak_memory_bytes ++ FROM gs_session_memory_detail;"''', ++ collect_mode=CollectMode.ASYNC, ++ tag="GaussDB内存使用", ++) ++def gauss_memory_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = { ++ "node_name": "节点名", ++ "dynamic_used_memory": "已使用动态内存(MB)", ++ "dynamic_peak_memory": "动态内存峰值(MB)", ++ } ++ cmd = '''gsql -d tpch -A -F , -c " ++ SELECT ++ 'localhost' AS node_name, ++ SUM(usedsize) AS dynamic_used_memory_bytes, ++ MAX(usedsize) AS dynamic_peak_memory_bytes ++ FROM gs_session_memory_detail;"''' ++ result = [ ++ {mapping.get(k, k): v for k, v in row.items()} for _, row in df.iterrows() ++ ] ++ return {cmd: result} +diff --git a/copilot-tune/src/performance_collector/application/mysql_collector.py b/copilot-tune/src/performance_collector/application/mysql_collector.py +new file mode 100644 +index 0000000..6559589 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/mysql_collector.py +@@ -0,0 +1,145 @@ ++from typing import Dict ++ ++from src.utils.collector.metric_collector import ( ++ snapshot_task, ++ CollectMode, ++) ++from src.utils.config.global_config import env_config ++ ++mysql_config = env_config.get("app_config").get("mysql") ++mysql_user = mysql_config["user"] ++mysql_password = mysql_config["password"] ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Connections';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL服务器连接次数", ++) ++def parse_mysql_connections(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Connections';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Uptime';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL启动时间(秒)", ++) ++def parse_mysql_uptime(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Uptime';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Innodb_rows_%';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL Innodb 行操作数", ++) ++def parse_mysql_innodb_rows(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL 
STATUS LIKE 'Innodb_rows_%';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_select';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL SELECT 执行次数", ++) ++def parse_mysql_com_select(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_select';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_insert';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL INSERT 执行次数", ++) ++def parse_mysql_com_insert(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_insert';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_update';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL UPDATE 执行次数", ++) ++def parse_mysql_com_update(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_update';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_delete';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL DELETE 执行次数", ++) ++def parse_mysql_com_delete(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Com_delete';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW STATUS LIKE '%THREAD%';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL 线程信息", ++) ++def parse_mysql_threads(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW STATUS LIKE '%THREAD%'\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Slow_queries';\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL 慢查询次数", ++) ++def parse_mysql_slow_queries(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW GLOBAL STATUS LIKE 'Slow_queries';\"" ++ result = _mysql_parse(output) ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW PROFILES;\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL Profiling 信息", ++) ++def parse_mysql_profiles(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW PROFILES\"" ++ return {cmd: output} ++ ++ ++@snapshot_task( ++ cmd=f"mysql -s -N -e \"SHOW PROCESSLIST;\" -u{mysql_user} -p{mysql_password}", ++ collect_mode=CollectMode.ASYNC, ++ tag="MySQL ProcessList 信息", ++) ++def parse_mysql_processlist(output: str) -> Dict: ++ cmd = "mysql -s -N -e \"SHOW PROCESSLIST\"" ++ return {cmd: output} ++ ++ ++def _mysql_parse(stdout: str) -> Dict: ++ """ ++ 通用MySQL输出解析:按制表符分割成键值对 ++ """ ++ result = {} ++ lines = stdout.strip().split("\n") ++ for line in lines: ++ parts = line.split("\t") ++ if len(parts) != 2: ++ continue ++ key, value = parts ++ result[key.strip()] = value.strip() ++ return result +diff --git a/copilot-tune/src/performance_collector/application/nginx_collector.py b/copilot-tune/src/performance_collector/application/nginx_collector.py +new file mode 100644 +index 0000000..77c7f52 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/nginx_collector.py +@@ -0,0 +1,88 @@ ++import logging ++import re ++from src.utils.collector.metric_collector 
import ( ++ period_task, ++ CollectMode, ++) ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++ ++NGINX_SAMPLE_INTERVAL = 5 # 每次采样间隔 ++SAMPLE_COUNT = 13 # 采样次数 ++DURATION = (SAMPLE_COUNT - 1) * NGINX_SAMPLE_INTERVAL # 统计总时长(秒) ++ ++ ++def parse_stub_status_text(text: str) -> dict: ++ result = {} ++ try: ++ lines = text.strip().splitlines() ++ for line in lines: ++ line = line.strip() ++ if line.startswith("Active connections"): ++ result["active_connections"] = int(line.split(":")[1].strip()) ++ elif line.startswith("server accepts handled requests"): ++ continue ++ elif re.match(r"^\d+\s+\d+\s+\d+", line): ++ parts = list(map(int, line.split())) ++ result["accepts"] = parts[0] ++ result["handled"] = parts[1] ++ result["requests"] = parts[2] ++ elif line.startswith("Reading"): ++ parts = re.findall(r"(\w+):\s*(\d+)", line) ++ for key, value in parts: ++ result[key.lower()] = int(value) ++ except Exception as e: ++ result["error"] = f"Failed to parse stub_status: {str(e)}" ++ return result ++ ++ ++@period_task( ++ cmd="curl -s http://127.0.0.1:10000/status", ++ tag="nginx_status指标", ++ delay=0, ++ sample_count=SAMPLE_COUNT, ++ interval=NGINX_SAMPLE_INTERVAL, ++ collect_mode=CollectMode.ASYNC ++) ++def parse_nginx_status(output: list[str]) -> dict: ++ parsed_list = [parse_stub_status_text(text) for text in output if text] ++ if len(parsed_list) < 2: ++ return {} ++ ++ # 统计连接状态均值 ++ conn_keys = ["active_connections", "reading", "writing", "waiting"] ++ conn_sum = {k: 0 for k in conn_keys} ++ valid_samples = 0 ++ ++ for item in parsed_list: ++ if all(k in item for k in conn_keys): ++ valid_samples += 1 ++ for k in conn_keys: ++ conn_sum[k] += item.get(k, 0) ++ ++ avg_conns = { ++ f"{DURATION}s内平均{k}": conn_sum[k] // valid_samples ++ for k in conn_keys ++ } ++ ++ # 统计累加指标的增量(以第1条和最后1条为基准) ++ try: ++ accepts_delta = parsed_list[-1]["accepts"] - parsed_list[0]["accepts"] ++ handled_delta = parsed_list[-1]["handled"] - parsed_list[0]["handled"] ++ requests_delta = parsed_list[-1]["requests"] - parsed_list[0]["requests"] ++ avg_qps = requests_delta // DURATION ++ except Exception: ++ accepts_delta = handled_delta = requests_delta = avg_qps = 0 ++ ++ result = { ++ f"{DURATION}s内请求总数增长": requests_delta, ++ f"{DURATION}s内接收连接数增长(accepts)": accepts_delta, ++ f"{DURATION}s内处理连接数增长(handled)": handled_delta, ++ f"{DURATION}s内平均QPS": avg_qps, ++ } ++ result.update(avg_conns) ++ return {"curl -s http://127.0.0.1:10000/status": result} ++ +diff --git a/copilot-tune/src/performance_collector/application/pgsql_collector.py b/copilot-tune/src/performance_collector/application/pgsql_collector.py +new file mode 100644 +index 0000000..0d2c04f +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/pgsql_collector.py +@@ -0,0 +1,128 @@ ++import logging ++from io import StringIO ++ ++import pandas as pd ++ ++from src.utils.collector.metric_collector import ( ++ period_task, ++ snapshot_task, ++ CollectMode, ++) ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++BIG_WRITER_COLLECT_INTERVAL = 180 ++ ++ ++# 采集5分钟内数据 ++@period_task( ++ cmd="su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT * FROM pg_stat_bgwriter;'\"", ++ collect_mode=CollectMode.ASYNC, ++ tag="pgsql缓存指标", ++ delay=0, ++ sample_count=2, ++ interval=BIG_WRITER_COLLECT_INTERVAL ++) ++def pg_stat_bgwriter_parser(output: list[str]) -> dict: ++ if len(output) < 2: ++ return {} # 需要两次采样才能计算差值 ++ ++ df1 = 
pd.read_csv(StringIO(output[0])) ++ df2 = pd.read_csv(StringIO(output[1])) ++ ++ if df1.empty or df2.empty: ++ return {} ++ ++ row1 = df1.iloc[0].to_dict() ++ row2 = df2.iloc[0].to_dict() ++ ++ mapping = { ++ "checkpoints_timed": "定时检查点次数", ++ "checkpoints_req": "请求检查点次数", ++ "checkpoint_write_time": "检查点写入耗时(ms)", ++ "checkpoint_sync_time": "检查点同步耗时(ms)", ++ "buffers_checkpoint": "检查点写出页数", ++ "buffers_clean": "后台清理写出页数", ++ "maxwritten_clean": "超限触发写出次数", ++ "buffers_backend": "后端写出页数", ++ "buffers_backend_fsync": "后端 fsync 次数", ++ "buffers_alloc": "分配新缓冲区页数", ++ } ++ ++ result = {} ++ for key, label in mapping.items(): ++ new_label = f"{BIG_WRITER_COLLECT_INTERVAL // 60}分钟内{label}" ++ ++ old_val = row1.get(key, 0) ++ new_val = row2.get(key, 0) ++ ++ try: ++ delta = int(new_val) - int(old_val) ++ except (ValueError, TypeError): ++ delta = 0 # 如果解析失败就默认 0 ++ ++ result[new_label] = max(delta, 0) # 防止 PostgreSQL 重启导致出现负值 ++ cmd = "su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT * FROM pg_stat_bgwriter;'\"" ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd="su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT datname, state, wait_event_type, wait_event FROM pg_stat_activity;'\"", ++ collect_mode=CollectMode.ASYNC, ++ tag="pgsql数据库连接信息", ++) ++def pg_stat_activity_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = { ++ "datname": "数据库名", ++ "state": "连接状态", ++ "wait_event_type": "等待事件类型", ++ "wait_event": "等待事件", ++ } ++ result = [] ++ for _, row in df.iterrows(): ++ raw = dict(row) ++ result.append({mapping.get(k, k): v for k, v in raw.items()}) ++ cmd = "su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT datname, state, wait_event_type, wait_event FROM pg_stat_activity;'\"" ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd="su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT datname, numbackends, xact_commit, xact_rollback, blks_read, blks_hit FROM pg_stat_database;'\"", ++ collect_mode=CollectMode.ASYNC, ++ tag="pgsql数据库指标", ++) ++def pg_stat_database_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = { ++ "datname": "数据库名", ++ "numbackends": "连接数", ++ "xact_commit": "提交事务数", ++ "xact_rollback": "回滚事务数", ++ "blks_read": "磁盘读块数", ++ "blks_hit": "缓冲命中块数", ++ } ++ result = [] ++ for _, row in df.iterrows(): ++ raw = dict(row) ++ result.append({mapping.get(k, k): v for k, v in raw.items()}) ++ cmd = "su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT datname, numbackends, xact_commit, xact_rollback, blks_read, blks_hit FROM pg_stat_database;'\"" ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd="su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT mode, granted, COUNT(*) as count FROM pg_locks GROUP BY mode, granted;'\"", ++ collect_mode=CollectMode.ASYNC, ++ tag="pgsql锁指标", ++) ++def pg_locks_parser(output: str) -> dict: ++ df = pd.read_csv(StringIO(output)) ++ mapping = {"mode": "锁模式", "granted": "是否已授予", "count": "锁数量"} ++ result = [] ++ for _, row in df.iterrows(): ++ raw = dict(row) ++ result.append({mapping.get(k, k): v for k, v in raw.items()}) ++ cmd = "su - postgres -c \"/usr/local/pgsql/bin/psql --csv -c 'SELECT mode, granted, COUNT(*) as count FROM pg_locks GROUP BY mode, granted;'\"" ++ return {cmd: result} +diff --git a/copilot-tune/src/performance_collector/application/redis_collector.py b/copilot-tune/src/performance_collector/application/redis_collector.py +new file mode 100644 +index 0000000..c36186a +--- /dev/null ++++ 
b/copilot-tune/src/performance_collector/application/redis_collector.py +@@ -0,0 +1,107 @@ ++from src.utils.collector.metric_collector import ( ++ snapshot_task, ++ CollectMode, ++) ++ ++ ++@snapshot_task( ++ cmd="redis-cli INFO", ++ collect_mode=CollectMode.ASYNC, ++ tag="Redis 实例的基本运行状态", ++) ++def parse_redis_info(info_output: str) -> dict: ++ """解析 redis-cli info 命令输出为带中文 key 的字典""" ++ info = {} ++ for line in info_output.strip().splitlines(): ++ if not line or line.startswith("#"): ++ continue ++ if ":" not in line: ++ continue ++ key, value = line.strip().split(":", 1) ++ info.update( ++ { ++ "运行时间(秒)": ( ++ int(value) ++ if key == "uptime_in_seconds" ++ else info.get("运行时间(秒)") ++ ), ++ "已连接客户端数": ( ++ int(value) ++ if key == "connected_clients" ++ else info.get("已连接客户端数") ++ ), ++ "内存使用(字节)": ( ++ int(value) if key == "used_memory" else info.get("内存使用(字节)") ++ ), ++ "每秒请求数(QPS)": ( ++ int(value) ++ if key == "instantaneous_ops_per_sec" ++ else info.get("每秒请求数(QPS)") ++ ), ++ "总命中次数": ( ++ int(value) if key == "keyspace_hits" else info.get("总命中次数") ++ ), ++ "总未命中次数": ( ++ int(value) if key == "keyspace_misses" else info.get("总未命中次数") ++ ), ++ "阻塞客户端数": ( ++ int(value) if key == "blocked_clients" else info.get("阻塞客户端数") ++ ), ++ } ++ ) ++ cmd = "redis-cli INFO" ++ result = {k: v for k, v in info.items() if v is not None} ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd="redis-cli INFO commandstats", ++ collect_mode=CollectMode.ASYNC, ++ tag="Redis 命令的调用次数、耗时", ++) ++def parse_commandstats(commandstats_output: str) -> dict: ++ """解析 commandstats 为每个命令调用次数和平均耗时""" ++ result = {} ++ for line in commandstats_output.strip().splitlines(): ++ if not line.startswith("cmdstat_"): ++ continue ++ parts = line.split(":") ++ cmd = parts[0].replace("cmdstat_", "") ++ values = dict(item.split("=") for item in parts[1].split(",")) ++ result[cmd] = { ++ "调用次数": int(values.get("calls", 0)), ++ "总耗时(微秒)": int(values.get("usec", 0)), ++ "平均耗时(微秒)": float(values.get("usec_per_call", 0)), ++ } ++ cmd = "redis-cli INFO commandstats" ++ return {cmd: result} ++ ++ ++@snapshot_task( ++ cmd="redis-cli INFO stats", ++ collect_mode=CollectMode.ASYNC, ++ tag="Redis key的命中率", ++) ++def parse_hit_rate_from_info_stats(info_stats_output: str) -> dict: ++ """ ++ 从 redis-cli INFO stats 的输出字符串中解析 key 命中率。 ++ 参数: ++ info_stats_output (str): INFO stats 命令的原始输出 ++ 返回: ++ dict: {'命中次数': ..., '未命中次数': ..., '命中率(%)': ...} ++ """ ++ hits = 0 ++ misses = 0 ++ ++ for line in info_stats_output.strip().splitlines(): ++ line = line.strip() ++ if line.startswith("keyspace_hits:"): ++ hits = int(line.split(":")[1]) ++ elif line.startswith("keyspace_misses:"): ++ misses = int(line.split(":")[1]) ++ ++ total = hits + misses ++ hit_rate = round(hits / total * 100, 2) if total else 0.0 ++ cmd = "redis-cli INFO stats" ++ result = {"命中次数": hits, "未命中次数": misses, "命中率(%)": hit_rate} ++ return {cmd: result} +diff --git a/copilot-tune/src/performance_collector/application/spark_collector.py b/copilot-tune/src/performance_collector/application/spark_collector.py +new file mode 100644 +index 0000000..f0361ce +--- /dev/null ++++ b/copilot-tune/src/performance_collector/application/spark_collector.py +@@ -0,0 +1,146 @@ ++import logging ++import requests ++import json ++from src.utils.collector.metric_collector import ( ++ period_task, ++ snapshot_task, ++ CollectMode, ++) ++from src.config import config ++ ++HOST_IP = config["servers"][0]["ip"] ++SPARK_HISTORY_SERVER = f"http://{HOST_IP}:18080" ++SAMPLE_INTERVAL = 60
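++# Sampling-window sketch (illustrative note, not a registered task): each
++# period_task in this module takes SAMPLE_COUNT samples spaced SAMPLE_INTERVAL
++# seconds apart, so every "{DURATION}s内…" delta metric below covers
++# DURATION = SAMPLE_INTERVAL * (SAMPLE_COUNT - 1) = 60 * (2 - 1) = 60 seconds.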
++SAMPLE_COUNT = 2 ++DURATION = SAMPLE_INTERVAL * (SAMPLE_COUNT - 1) ++ ++logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") ++ ++ ++@snapshot_task( ++ cmd="curl -s {}/api/v1/applications | jq -r '.[0].id'".format(SPARK_HISTORY_SERVER), ++ tag="spark作业信息", ++ collect_mode=CollectMode.ASYNC ++) ++def spark_job_info(app_id: str) -> dict: ++ app_id = app_id.strip().strip('"') ++ if not app_id: ++ return {} ++ try: ++ cmd = f"{SPARK_HISTORY_SERVER}/api/v1/applications/{app_id}/jobs" ++ resp = requests.get(cmd, timeout=10) ++ jobs = resp.json() ++ total_jobs = len(jobs) ++ running_jobs = sum(1 for job in jobs if job["status"] == "RUNNING") ++ failed_jobs = sum(1 for job in jobs if job["status"] == "FAILED") ++ total_tasks = sum(job.get("numTasks") for job in jobs) ++ total_failed_tasks = sum(job.get("numFailedTasks") for job in jobs) ++ total_killed_tasks = sum(job.get("numKilledTasks") for job in jobs) ++ total_skipped_tasks = sum(job.get("numSkippedTasks") for job in jobs) ++ total_completed_stages = sum(job.get("numCompletedStages") for job in jobs) ++ result = { ++ "Job总数": total_jobs, ++ "运行中Job数": running_jobs, ++ "失败Job数": failed_jobs, ++ "任务总数": total_tasks, ++ "失败Task总数": total_failed_tasks, ++ "被杀Task总数": total_killed_tasks, ++ "跳过Task总数": total_skipped_tasks, ++ "已完成Stage总数": total_completed_stages, ++ } ++ return {cmd: result} ++ ++ except Exception as e: ++ logging.warning(f"获取 job 信息失败: {e}") ++ return {} ++ ++ ++@snapshot_task( ++ cmd="curl -s {}/api/v1/applications | jq -r '.[0].id'".format(SPARK_HISTORY_SERVER), ++ tag="spark阶段信息", ++ collect_mode=CollectMode.ASYNC ++) ++def spark_stage_info(app_id: str) -> dict: ++ app_id = app_id.strip().strip('"') # 去掉引号与换行 ++ if not app_id: ++ return {} ++ ++ try: ++ cmd = f"{SPARK_HISTORY_SERVER}/api/v1/applications/{app_id}/stages" ++ resp = requests.get(cmd, timeout=10) ++ stages = resp.json() ++ total_stages = len(stages) ++ total_tasks = sum(s.get("numTasks", 0) for s in stages) ++ total_executor_time = sum(s.get("executorRunTime", 0) for s in stages) ++ total_gc_time = sum(s.get("jvmGcTime", 0) for s in stages) ++ total_mem_spill = sum(s.get("memoryBytesSpilled", 0) for s in stages) ++ total_disk_spill = sum(s.get("diskBytesSpilled", 0) for s in stages) ++ failed_stages = sum(1 for s in stages if s["status"] == "FAILED") ++ result = { ++ "Stage总数": total_stages, ++ "失败Stage数": failed_stages, ++ "总任务数": total_tasks, ++ "总执行时间(ms)": total_executor_time, ++ "总GC时间(ms)": total_gc_time, ++ "GC占比": f"{(total_gc_time / total_executor_time) * 100:.2f}%" if total_executor_time else "0%", ++ "总Memory Spill": total_mem_spill, ++ "总Disk Spill": total_disk_spill, ++ } ++ return {cmd: result} ++ except Exception as e: ++ logging.warning(f"获取 stage 信息失败: {e}") ++ return {} ++ ++ ++@period_task( ++ cmd="curl -s {}/api/v1/applications/$(curl -s {}/api/v1/applications | jq -r '.[0].id')/executors".format( ++ SPARK_HISTORY_SERVER, SPARK_HISTORY_SERVER ++ ), ++ tag="spark执行器信息", ++ collect_mode=CollectMode.ASYNC, ++ delay=0, ++ sample_count=SAMPLE_COUNT, ++ interval=SAMPLE_INTERVAL ++) ++def spark_executor_info(output: list[str]) -> dict: ++ if len(output) < 2: ++ return {} ++ try: ++ cmd = "curl -s {}/api/v1/applications/$(curl -s {}/api/v1/applications | jq -r '.[0].id')/executors".format( ++ SPARK_HISTORY_SERVER, SPARK_HISTORY_SERVER ++ ) ++ data1 = json.loads(output[0]) ++ data2 = json.loads(output[1]) ++ ++ def agg(executors): ++ filtered = [e for e in executors if e.get("id") != "driver"] ++ return { ++ 
"executor_count": len(filtered), ++ "total_cores": sum(e.get("totalCores", 0) for e in filtered), ++ "total_tasks": sum(e.get("totalTasks", 0) for e in filtered), ++ "failed_tasks": sum(e.get("failedTasks", 0) for e in filtered), ++ "total_gc_time": sum(e.get("totalGCTime", 0) for e in filtered), ++ } ++ ++ metrics1 = agg(data1) ++ metrics2 = agg(data2) ++ delta_tasks = max(0, metrics2["total_tasks"] - metrics1["total_tasks"]) ++ delta_gc = max(0, metrics2["total_gc_time"] - metrics1["total_gc_time"]) ++ avg_tasks_per_executor = ( ++ delta_tasks // metrics2["executor_count"] ++ if metrics2["executor_count"] > 0 else 0 ++ ) ++ result = { ++ f"{DURATION}s内任务总量": metrics1["total_tasks"] + metrics2["total_tasks"], ++ f"{DURATION}s内GC总耗时(ms)": metrics1["total_gc_time"] + metrics2["total_gc_time"], ++ f"{DURATION}s内任务增长量": delta_tasks, ++ f"{DURATION}s内GC总耗时增长量(ms)": delta_gc, ++ "Executor数": metrics2["executor_count"], ++ "总核数": metrics2["total_cores"], ++ "失败任务数": metrics2["failed_tasks"], ++ f"{DURATION}s内平均每Executor任务增长数": avg_tasks_per_executor ++ } ++ return {cmd: result} ++ except Exception as e: ++ logging.error(f"解析 executor 指标失败: {e}") ++ return {} +diff --git a/copilot-tune/src/performance_collector/base_collector.py b/copilot-tune/src/performance_collector/base_collector.py +new file mode 100644 +index 0000000..5e2fb27 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/base_collector.py +@@ -0,0 +1,63 @@ ++from abc import abstractmethod ++from typing import Dict, Any ++from typing import List, Optional ++ ++from pydantic import BaseModel, Field ++ ++from src.utils.shell_execute import SshClient # 假设这个是你的类 ++ ++ ++class CollectorArgs(BaseModel): ++ cmds: List[str] = Field(default_factory=list) ++ ssh_client: Optional[SshClient] = None ++ ++ model_config = { ++ "arbitrary_types_allowed": True ++ } ++ ++ ++class BaseCollector: ++ def __init__(self, **kwargs): ++ # 使用pydantic模型的构造函数来初始化args ++ self.args = CollectorArgs(**kwargs) ++ ++ def get_cmd_stdout( ++ self, ++ ) -> Dict: ++ # 执行远程命令 ++ result = {} ++ for cmd in self.args.cmds: ++ cmd_res = self.args.ssh_client.run_cmd( ++ cmd=cmd ++ ) ++ res = {cmd: cmd_res.output} ++ result = {**result, **res} ++ return result ++ ++ @abstractmethod ++ def parse_cmd_stdout(self, **kwargs) -> Dict: ++ pass ++ ++ def default_parse( ++ self, ++ cmd: str, ++ stdout: Any, ++ ) -> Dict: ++ return {cmd: stdout} ++ ++ @abstractmethod ++ def data_process(self, **kwargs) -> Dict: ++ pass ++ ++ def run(self) -> Dict: ++ # 1. 获取命令执行结果 ++ cmd_stdout = self.get_cmd_stdout() ++ ++ # 2. 解析命令输出 ++ parsed_data = self.parse_cmd_stdout(cmd_stdout) ++ ++ # 3. 处理数据 ++ processed_data = self.data_process(parsed_data) ++ ++ # 4. 
返回处理后的数据 ++ return processed_data +diff --git a/copilot-tune/src/performance_collector/cpu_collector.py b/copilot-tune/src/performance_collector/cpu_collector.py +new file mode 100644 +index 0000000..3b5e298 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/cpu_collector.py +@@ -0,0 +1,304 @@ ++from .base_collector import BaseCollector ++from typing import Dict, Any, List ++import logging ++import json ++from enum import Enum ++ ++class CpuMetric(Enum): ++ ONE_MINUTE_AVG_LOAD = "1min" ++ FIVE_MINUTE_AVG_LOAD = "5min" ++ FIFTEEN_MINUTE_AVG_LOAD = "15min" ++ ++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') ++ ++perf = "perf stat -e 'syscalls:*' -a sleep 1 2>&1 | grep syscalls| awk '{sum += $1} END {print sum}'" ++ ++def get_cpu_cmd()-> List[str]: ++ return list(CPU_PARSE_FUNCTIONS.keys()) ++ ++def nproc_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "nproc": ++ logging.error("Command is not 'nproc'.") ++ raise ValueError("Command must be 'nproc'") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ logical_cpu_cores = int(stdout.split("\n")[0]) ++ except (IndexError, ValueError) as e: ++ logging.error(f"Failed to parse CPU count from stdout: {e}") ++ raise ValueError("Failed to parse CPU count from stdout") from e ++ ++ res = {"cpu核数": logical_cpu_cores} ++ return res ++ ++def loadavg_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "cat /proc/loadavg": ++ logging.error("Command is not 'cat /proc/loadavg'.") ++ raise ValueError("Command must be 'cat /proc/loadavg'") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ out = stdout.split("\n") ++ data = out[0].split() ++ if len(data) < 3: ++ raise ValueError("Not enough data to parse load averages.") ++ ++ # /proc/loadavg 的三个字段依次为 1、5、15 分钟平均负载 ++ load_avgs = {"过去1min平均负载": float(data[0]), ++ "过去5min平均负载": float(data[1]), ++ "过去15min平均负载": float(data[2])} ++ except (IndexError, ValueError) as e: ++ logging.error(f"Failed to parse system load averages from stdout: {e}") ++ raise ValueError("Failed to parse system load averages from stdout") from e ++ ++ return load_avgs ++ ++def perf_syscall_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != perf: ++ logging.error("Command is not 'perf'.") ++ raise ValueError("Command must be 'perf'") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ sys_call_rate = float(stdout.split("\n")[0]) ++ except (IndexError, ValueError) as e: ++ logging.error(f"Failed to parse system call rate from stdout: {e}") ++ raise ValueError("Failed to parse system call rate from stdout") from e ++ ++ res = {"系统单位时间调用次数": sys_call_rate} ++ return res ++ ++def mpstat_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "mpstat -P ALL -o JSON 1 1": ++ logging.error("Command is not 'mpstat'.") ++ raise ValueError("Command must be 'mpstat'") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ stdout_data = json.loads(stdout) ++ data = stdout_data["sysstat"]["hosts"][0]["statistics"][0]["cpu-load"][0] ++ ++ usr, nice, sys, iowait, irq, soft, steal, guest, gnice, idle = map(float, (data["usr"], data["nice"], data["sys"], data["iowait"], data["irq"], data["soft"], 
data["steal"], data["guest"], data["gnice"], data["idle"])) ++ ++ res = { ++ "用户态中的cpu利用率": usr, ++ "具有nice优先级的用户态CPU使用率": nice, ++ "kernel内核态执行时的CPU利用率": sys, ++ "系统有未完成的磁盘I/O请求时,等待IO占用CPU的百分比": iowait, ++ "硬中断占用CPU时间的百分比": irq, ++ "软中断占用CPU时间的百分比": soft, ++ "虚拟化环境中,其他虚拟机占用的CPU时间百分比": steal, ++ "运行虚拟处理器时CPU花费时间的百分比": guest, ++ "运行带有nice优先级的虚拟CPU所花费的时间百分比": gnice, ++ "CPU处在空闲状态的时间百分比": idle ++ } ++ except json.JSONDecodeError as e: ++ logging.error(f"Failed to parse JSON from stdout: {e}") ++ raise ValueError("Failed to parse JSON from stdout") from e ++ except (IndexError, ValueError, TypeError) as e: ++ logging.error(f"Failed to parse mpstat CPU statistics: {e}") ++ raise ValueError("Failed to parse mpstat CPU statistics") from e ++ ++ return res ++ ++def process_parse(cmd, stdout): ++ if cmd != "ps aux|wc -l": ++ logging.error("Command is not 'ps aux|wc -l'.") ++ raise ValueError("Command is not 'ps aux|wc -l'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ total_process = float(stdout.split("\n")[0]) ++ except (ValueError, IndexError) as e: ++ logging.error(f"Failed to parse total process count from stdout: {e}") ++ raise ValueError("Failed to parse total process count from stdout") from e ++ ++ res = {"总进程数": total_process} ++ return res ++ ++def vmstat_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "vmstat 1 2": ++ logging.error("Command is not 'vmstat'.") ++ raise ValueError("Command is not 'vmstat'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ out = stdout.split("\n") ++ out.pop() ++ data = out[-1].split() ++ ++ runtime_num = int(data[0]) ++ blocked_num = int(data[1]) ++ context_switch = int(data[11]) ++ ++ res = { ++ "运行队列中进程的数量": runtime_num, ++ "被阻塞的进程数": blocked_num, ++ "系统每秒进行上下文切换的次数": context_switch ++ } ++ except IndexError as e: ++ logging.error(f"Failed to parse vmstat memory attributes: {e}") ++ raise ValueError("Failed to parse vmstat memory attributes from stdout") from e ++ except ValueError as e: ++ logging.error(f"Failed to convert vmstat values to expected types: {e}") ++ raise ValueError("Failed to convert vmstat values to expected types") from e ++ ++ return res ++ ++def pid_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "pidstat -d | head -6": ++ logging.error("Command is not 'pidstat'.") ++ raise ValueError("Command is not 'pidstat'.") ++ return {"进程信息": stdout} ++ ++CPU_PARSE_FUNCTIONS = { ++ "nproc": nproc_parse, ++ "cat /proc/loadavg": loadavg_parse, ++ perf: perf_syscall_parse, ++ "mpstat -P ALL -o JSON 1 1": mpstat_parse, ++ "ps aux|wc -l": process_parse, ++ "vmstat 1 2": vmstat_parse, ++ "pidstat -d | head -6": pid_parse, ++} ++ ++class CpuCollector(BaseCollector): ++ def __init__(self, cmd: List[str], **kwargs): ++ # 将cmd添加到kwargs中 ++ kwargs['cmds'] = cmd ++ super().__init__(**kwargs) ++ ++ def parse_cmd_stdout( ++ self, ++ cpu_info_stdout: Dict[str, Any], ++ ) -> Dict: ++ parse_result = {} ++ for k, v in cpu_info_stdout.items(): ++ # 使用字典获取对应的解析函数,如果cmd不在字典中,使用默认的解析函数 ++ parse_function = CPU_PARSE_FUNCTIONS.get(k, self.default_parse) ++ cmd_parse_result = parse_function(k, v) ++ parse_result = {**parse_result, **cmd_parse_result} ++ return parse_result ++ ++ def normalize_percentage( ++ self, ++ value: Any, ++ total: float, ++ ) -> float: ++ return value / total if total != 0 else 0 ++ ++ def 
is_heavy_load( ++ self, ++ usage: float, ++ ) -> bool: ++ return usage > 0.7 # 利用率此前已归一化到 0-1 区间,0.7 即 70% ++ ++ def data_process( ++ self, ++ cpu_parse_result: Dict, ++ ) -> Dict: ++ logging.info(f"[CpuCollector] collecting cpu workload metrics") ++ cpu_process_result = {} ++ ++ # 计算平均负载 ++ for metric in [CpuMetric.ONE_MINUTE_AVG_LOAD, CpuMetric.FIVE_MINUTE_AVG_LOAD, CpuMetric.FIFTEEN_MINUTE_AVG_LOAD]: ++ cpu_process_result[metric.value] = self.normalize_percentage( ++ cpu_parse_result[f"过去{metric.value}平均负载"], ++ cpu_parse_result["cpu核数"] ++ ) ++ ++ # 计算CPU利用率 ++ cpu_utilizations = [ ++ "用户态中的cpu利用率", ++ "具有nice优先级的用户态CPU使用率", ++ "kernel内核态执行时的CPU利用率" ++ ] ++ for utilization in cpu_utilizations: ++ cpu_process_result[utilization] = self.normalize_percentage( ++ cpu_parse_result[utilization], 100 ++ ) ++ ++ # 其他百分比计算 ++ for key in [ ++ "硬中断占用CPU时间的百分比", ++ "软中断占用CPU时间的百分比", ++ "虚拟化环境中,其他虚拟机占用的CPU时间百分比", ++ "运行虚拟处理器时CPU花费时间的百分比", ++ "运行带有nice优先级的虚拟CPU所花费的时间百分比" ++ ]: ++ cpu_process_result[key] = self.normalize_percentage( ++ cpu_parse_result[key], 100 ++ ) ++ ++ # 计算CPU利用率和上下文切换次数 ++ cpu_process_result["CPU利用率"] = 1 - self.normalize_percentage( ++ cpu_parse_result["CPU处在空闲状态的时间百分比"], 100 ++ ) ++ cpu_process_result["系统每秒进行上下文切换的次数"] = cpu_parse_result.get( ++ "系统每秒进行上下文切换的次数", 0 ++ ) ++ ++ # 阻塞进程率 ++ cpu_process_result["阻塞进程率"] = self.normalize_percentage( ++ cpu_parse_result["被阻塞的进程数"], cpu_parse_result["总进程数"] ++ ) ++ ++ # 确保内核态执行时的CPU利用率不为0 ++ cpu_process_result["kernel内核态执行时的CPU利用率"] = max( ++ 0.01, cpu_process_result["kernel内核态执行时的CPU利用率"] ++ ) ++ ++ # 判断计算密集型或IO密集型 ++ user_mode_ratio = cpu_process_result["用户态中的cpu利用率"] / cpu_process_result["kernel内核态执行时的CPU利用率"] ++ is_heavy_io = self.is_heavy_load(cpu_process_result["用户态中的cpu利用率"]) or self.is_heavy_load(cpu_process_result["kernel内核态执行时的CPU利用率"]) ++ ++ if user_mode_ratio > 2: ++ cpu_process_result["计算密集型"] = 1 if is_heavy_io else 0 ++ else: ++ cpu_process_result["计算密集型"] = 0 ++ ++ if user_mode_ratio < 2: ++ cpu_process_result["IO密集型"] = 1 if is_heavy_io else 0 ++ else: ++ cpu_process_result["IO密集型"] = 0 ++ ++ # 复制其他信息 ++ cpu_process_result["进程信息"] = cpu_parse_result.get("进程信息", []) ++ cpu_process_result["系统单位时间调用次数"] = cpu_parse_result.get("系统单位时间调用次数", 0) ++ cpu_process_result["cpu核数"] = cpu_parse_result.get("cpu核数", 0) ++ ++ return cpu_process_result +diff --git a/copilot-tune/src/performance_collector/disk_collector.py b/copilot-tune/src/performance_collector/disk_collector.py +new file mode 100644 +index 0000000..e35cd8c +--- /dev/null ++++ b/copilot-tune/src/performance_collector/disk_collector.py +@@ -0,0 +1,117 @@ ++from .base_collector import BaseCollector ++from typing import Dict, Any, List ++import logging ++import json ++from enum import Enum ++ ++class DiskMetric(Enum): ++ TODO = "XX" ++ ++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') ++ ++def get_disk_cmd()-> List[str]: ++ return list(DISK_PARSE_FUNCTIONS.keys()) ++ ++def parse_disk_data( ++ data: Dict[str, Any], ++) -> Dict: ++ device_name = data["disk_device"] ++ r, rkb, w, wkb = float(data["r/s"]), float(data["rkB/s"]), float(data["w/s"]), float(data["wkB/s"]) ++ return {device_name: {"单位时间读速率": r, "单位时间读大小": rkb, "单位时间写速率": w, "单位时间写大小": wkb}} ++ ++def parse_disk_util_data(a_data: dict, b_data: dict) -> dict: ++ device_name = b_data["disk_device"] ++ ++ try: ++ await_a = float(a_data.get("await")) ++ await_b = float(b_data.get("await")) ++ await_change = await_b - await_a # 正数表示变慢,负数表示变快 ++ except (TypeError, ValueError, KeyError): ++ # 如果 'await' 
不存在或无效,回退到 r_await + w_await 的简单相加 ++ logging.warning(f"Device {device_name}: 'await' not available, falling back to r_await + w_await") ++ r_await_a = float(a_data.get("r_await", 0.0)) ++ w_await_a = float(a_data.get("w_await", 0.0)) ++ r_await_b = float(b_data.get("r_await", 0.0)) ++ w_await_b = float(b_data.get("w_await", 0.0)) ++ await_change = (r_await_b + w_await_b) - (r_await_a + w_await_a) ++ ++ # 磁盘队列长度变化趋势 ++ try: ++ aqu_sz_change = float(b_data["aqu-sz"]) - float(a_data["aqu-sz"]) ++ except (TypeError, ValueError, KeyError): ++ aqu_sz_change = 0.0 ++ ++ # 磁盘利用率(直接取最新值) ++ try: ++ util = float(b_data["util"]) ++ except (TypeError, ValueError, KeyError): ++ util = 0.0 ++ ++ return {device_name: {"磁盘平均等待时间变化趋势": await_change, "磁盘平均请求队列长度变化趋势": aqu_sz_change, "磁盘利用率": util}} ++ ++ ++def iostat_parse(cmd, stdout): ++ if cmd == "iostat -o JSON -dx 1 2": ++ try: ++ stdout = json.loads(stdout) ++ disk = [parse_disk_data(data) for data in stdout["sysstat"]["hosts"][0]["statistics"][1]["disk"]] ++ res = {"磁盘读写性能": disk} ++ except json.JSONDecodeError as e: ++ logging.error(f"Failed to parse JSON from stdout: {e}") ++ raise ValueError("Failed to parse JSON from stdout") from e ++ elif cmd == "iostat -o JSON -dx 1 2; sleep 5; iostat -o JSON -dx 1 2": ++ try: ++ split_index = stdout.index('}\n{"sysstat": {') ++ a, b = stdout[:split_index+2], stdout[split_index+2:] ++ a = a[:-1] ++ a_json = json.loads(a) ++ b_json = json.loads(b) ++ disk = [parse_disk_util_data(a_data, b_data) for a_data, b_data in zip( ++ a_json["sysstat"]["hosts"][0]["statistics"][1]["disk"], ++ b_json["sysstat"]["hosts"][0]["statistics"][1]["disk"])] ++ res = {"磁盘利用": disk} ++ except (ValueError, json.JSONDecodeError) as e: ++ logging.error(f"Failed to parse disk utilization data: {e}") ++ raise ValueError("Failed to parse disk utilization data") from e ++ else: ++ logging.warning("Received unknown command.") ++ return {"error": "Unknown command"} ++ ++ return res ++ ++DISK_PARSE_FUNCTIONS = { ++ "iostat -o JSON -dx 1 2": iostat_parse, ++ "iostat -o JSON -dx 1 2; sleep 5; iostat -o JSON -dx 1 2": iostat_parse, ++} ++ ++class DiskCollector(BaseCollector): ++ def __init__(self, cmd: List[str], **kwargs): ++ kwargs['cmds'] = cmd ++ super().__init__(**kwargs) ++ ++ def parse_cmd_stdout( ++ self, ++ disk_info_stdout: Dict[str, Any], ++ ) -> Dict: ++ parse_result = {} ++ for k, v in disk_info_stdout.items(): ++ # 使用字典获取对应的解析函数,如果cmd不在字典中,使用默认的解析函数 ++ parse_function = DISK_PARSE_FUNCTIONS.get(k, self.default_parse) ++ cmd_parse_result = parse_function(k, v) ++ parse_result = {**parse_result, **cmd_parse_result} ++ return parse_result ++ ++ def data_process( ++ self, ++ disk_parse_result: Dict, ++ ) -> Dict: ++ logging.info(f"[DiskCollector] collecting disk workload metrics") ++ disk_process_result = { ++ # "iowait": disk_parse_result["系统有未完成的磁盘I/O请求时,等待IO占用CPU的百分比"] / 100, ++ "磁盘信息": disk_parse_result["磁盘利用"], ++ } ++ for i in range(len(disk_process_result["磁盘信息"])): ++ for key in disk_process_result["磁盘信息"][i]: ++ disk_process_result["磁盘信息"][i][key].update(disk_parse_result["磁盘读写性能"][i][key]) ++ ++ return disk_process_result +diff --git a/copilot-tune/src/performance_collector/memory_collector.py b/copilot-tune/src/performance_collector/memory_collector.py +new file mode 100644 +index 0000000..80adbe5 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/memory_collector.py +@@ -0,0 +1,151 @@ ++from .base_collector import BaseCollector ++from typing import Dict, Any, List ++import logging ++from enum import Enum ++ 
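++# A minimal offline sketch of this module's parser contract (hypothetical
++# sample output; `free_parse` is defined below). Each parser receives the
++# exact command string plus its raw stdout and returns Chinese-keyed metrics:
++#
++#     >>> sample = ("              total        used        free\n"
++#     ...           "Mem:        16384000     8000000     2000000\n"
++#     ...           "Swap:        8192000      100000     8092000\n")
++#     >>> free_parse("free", sample)
++#     {'总的交换空间总量': 8192000.0, '可用的交换空间总量': 8092000.0}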
++class MemoryMetric(Enum): ++ TODO = "XX" ++ ++ ++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') ++ ++omm_kill_cmd = "(oom_kill1=$(cat /proc/vmstat | grep oom_kill | awk '{print$2}'); sleep 5; oom_kill2=$(cat /proc/vmstat | grep oom_kill | awk '{print$2}')) && echo $((oom_kill2 - oom_kill1))" ++ ++def get_memory_cmd()-> List[str]: ++ return list(MEMORY_PARSE_FUNCTIONS.keys()) ++def free_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "free": ++ logging.error("Command is not 'free'.") ++ raise ValueError("Command is not 'free'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ out = stdout.split("\n") ++ out.pop() ++ data = out[-1].split() ++ total_swap = float(data[1]) ++ free_swap = float(data[3]) ++ ++ res = {"总的交换空间总量": total_swap, "可用的交换空间总量": free_swap} ++ except IndexError as e: ++ logging.error(f"Failed to parse memory and swap usage: {e}") ++ raise ValueError("Failed to parse memory and swap usage from stdout") from e ++ except ValueError as e: ++ logging.error(f"Failed to convert swap values to float: {e}") ++ raise ValueError("Failed to convert swap values to float") from e ++ ++ return res ++ ++def omm_kill_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != omm_kill_cmd: ++ logging.error("Command is not 'omm_kill'.") ++ raise ValueError("Command is not 'omm_kill'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ omm_kill = float(stdout.split("\n")[0]) ++ res = {"omm_kill": omm_kill} ++ except ValueError as e: ++ logging.error(f"Failed to parse OOM killer count from stdout: {e}") ++ raise ValueError("Failed to parse OOM killer count from stdout") from e ++ ++ return res ++ ++def sar_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "sar -r 1 1": ++ logging.error("Command is not 'sar'.") ++ raise ValueError("Command is not 'sar'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ out = stdout.split("\n") ++ out.pop() ++ date = out[-1].split() ++ memory_usage = float(date[4]) ++ ++ res = {"内存使用率": memory_usage} ++ except IndexError as e: ++ logging.error(f"Failed to parse memory usage from sar output: {e}") ++ raise ValueError("Failed to parse memory usage from sar output") from e ++ except ValueError as e: ++ logging.error(f"Failed to convert memory usage to float: {e}") ++ raise ValueError("Failed to convert memory usage to float") from e ++ ++ return res ++ ++MEMORY_PARSE_FUNCTIONS = { ++ "free": free_parse, ++ omm_kill_cmd: omm_kill_parse, ++ "sar -r 1 1": sar_parse, ++} ++ ++class MemoryCollector(BaseCollector): ++ def __init__(self, cmd: List[str], **kwargs): ++ kwargs['cmds'] = cmd ++ super().__init__(**kwargs) ++ ++ def parse_cmd_stdout( ++ self, ++ memory_info_stdout: Dict[str, Any], ++ ) -> Dict: ++ parse_result = {} ++ for k, v in memory_info_stdout.items(): ++ # 使用字典获取对应的解析函数,如果cmd不在字典中,使用默认的解析函数 ++ parse_function = MEMORY_PARSE_FUNCTIONS.get(k, self.default_parse) ++ cmd_parse_result = parse_function(k, v) ++ parse_result = {**parse_result, **cmd_parse_result} ++ return parse_result ++ ++ def calculate_swap_usage( ++ self, ++ available_swap: float, ++ total_swap: float ++ ) -> float: ++ """计算交换空间使用率""" ++ if total_swap > 0: ++ return 1 - (available_swap / total_swap) ++ 
else: ++ return 1 ++ ++ def data_process( ++ self, ++ memory_parse_result: Dict, ++ ) -> Dict: ++ logging.info(f"[MemoryCollector] collecting memory workload metrics") ++ memory_process_result = {} ++ ++ # 计算交换空间使用率 ++ memory_process_result["交换空间使用率"] = self.calculate_swap_usage( ++ memory_parse_result["可用的交换空间总量"], ++ memory_parse_result["总的交换空间总量"] ++ ) ++ ++ # 内存使用率 ++ memory_process_result["内存使用率"] = memory_parse_result["内存使用率"] / 100 ++ ++ # # Swapout 判断 ++ # SWAPOUT_THRESHOLD = 5 # 定义阈值常量 ++ # memory_process_result["swapout"] = int(memory_parse_result["每秒从主内存交换到交换空间的页面数"] > SWAPOUT_THRESHOLD) ++ ++ # OOM Killer 判断 ++ memory_process_result["omm_kill"] = int(memory_parse_result["omm_kill"] > 0) ++ ++ return memory_process_result +diff --git a/copilot-tune/src/performance_collector/metric_collector.py b/copilot-tune/src/performance_collector/metric_collector.py +new file mode 100644 +index 0000000..d06fab2 +--- /dev/null ++++ b/copilot-tune/src/performance_collector/metric_collector.py +@@ -0,0 +1,81 @@ ++import logging ++ ++from .cpu_collector import CpuCollector, get_cpu_cmd ++from .disk_collector import DiskCollector, get_disk_cmd ++from .memory_collector import MemoryCollector, get_memory_cmd ++from .network_collector import NetworkCollector, get_network_cmd ++from .base_collector import CollectorArgs ++ ++from src.utils.shell_execute import SshClient ++from src.performance_collector.app_collector import AppCollector ++from src.utils.collector.collector_trigger import TriggerEventListener, TriggerStatus ++ ++ ++triggered_event_listener = TriggerEventListener() ++ ++ ++class MetricCollector: ++ ++ def __init__( ++ self, ++ ssh_client: SshClient, ++ app: str = None, ++ pressure_test_mode: bool = False, ++ ): ++ self.args = CollectorArgs( ++ ssh_client=ssh_client ++ ) ++ self.app = app # 新增app属性 ++ self.pressure_test_mode = pressure_test_mode ++ self.cpu_collector = CpuCollector( ++ cmd=get_cpu_cmd(), ++ ssh_client=self.args.ssh_client, ++ ) ++ self.disk_collector = DiskCollector( ++ cmd=get_disk_cmd(), ++ ssh_client=self.args.ssh_client, ++ ) ++ self.memory_collector = MemoryCollector( ++ cmd=get_memory_cmd(), ++ ssh_client=self.args.ssh_client, ++ ) ++ self.network_collector = NetworkCollector( ++ cmd=get_network_cmd(), ++ ssh_client=self.args.ssh_client, ++ ) ++ self.app_collector = AppCollector( ++ ssh_client=ssh_client, ++ app=app, ++ ) ++ ++ def run(self) -> dict: ++ """ ++ 运行所有数据收集器,收集并返回综合结果。 ++ """ ++ logging.info("[MetricCollector] collecting workload metrics ...") ++ # 全局触发模式,如果需要一边模拟压测一边采集数据时开启,阻塞程序直到可以开始采集数据 ++ if self.pressure_test_mode: ++ logging.info("[MetricCollector] waiting for pressure test initializing ...") ++ event_status = triggered_event_listener.wait() ++ ++ if event_status == TriggerStatus.CLOSE: ++ raise RuntimeError( ++ f"[MetricCollector] waiting for trigger signale timeout, skip tasks" ++ ) ++ # 调用每个子收集器的 run 方法 ++ cpu_data = self.cpu_collector.run() ++ disk_data = self.disk_collector.run() ++ memory_data = self.memory_collector.run() ++ network_data = self.network_collector.run() ++ app_data = self.app_collector.run() ++ ++ # 合并所有收集到的数据 ++ combined_data = { ++ "Cpu": cpu_data, ++ "Disk": disk_data, ++ "Memory": memory_data, ++ "Network": network_data, ++ "Application": app_data, ++ } ++ ++ return combined_data +diff --git a/copilot-tune/src/performance_collector/micro_dep_collector.py b/copilot-tune/src/performance_collector/micro_dep_collector.py +new file mode 100644 +index 0000000..972e92b +--- /dev/null ++++ 
b/copilot-tune/src/performance_collector/micro_dep_collector.py +@@ -0,0 +1,453 @@ ++import logging ++from abc import ABC, abstractmethod ++from typing import Dict, List ++ ++from src.utils.shell_execute import SshClient ++ ++ ++class COLLECTMODE: ++ DIRECT_MODE = 0 ++ ATTACH_MODE = 1 ++ ++ ++class BaseCollector(ABC): ++ """性能数据收集器基类""" ++ ++ def __init__(self): ++ self.raw_data: Dict[str, float] = {} ++ self.processed_data: Dict[str, float] = {} ++ self.collect_cmd: str = "" ++ ++ @abstractmethod ++ def collect(self): ++ """收集性能数据""" ++ pass ++ ++ @abstractmethod ++ def process(self): ++ """处理原始数据""" ++ pass ++ ++ @staticmethod ++ def is_number(s: str) -> bool: ++ """检查字符串是否为数字""" ++ try: ++ float(s) ++ return True ++ except ValueError: ++ return False ++ ++ ++class PerfCollector(BaseCollector): ++ """基于perf的性能数据收集器""" ++ ++ # 性能事件配置 ++ PMU_CONFIGS = { ++ "topdown": { ++ 'r0011': "cycle", ++ 'r2014': "fetch_bubble", ++ 'r201d': "fetch_bubble_max", ++ 'r001b': "inst_spec", ++ 'r0008': "inst_retired", ++ 'r7001': "execstall_cycle", ++ 'r7003': "fsustall", ++ 'r7004': "memstall_anyload", ++ 'r7005': "memstall_anystore", ++ 'r7006': "memstall_l1miss", ++ 'r7007': "memstall_l2miss", ++ 'r0010': "brmisspred", ++ 'r2013': "o3flush", ++ 'context-switches': "context_switches", ++ 'cpu-migrations': "cpu_migrations", ++ 'page-faults': "page_faults", ++ }, ++ "cache": { ++ 'r0001': 'l1i_refill', ++ 'r0014': 'l1i_access', ++ 'r0003': 'l1d_refill', ++ 'r0004': 'l1d_access', ++ 'r0028': 'l2i_refill', ++ 'r0027': 'l2i_access', ++ 'r0017': 'l2d_refill', ++ 'r0016': 'l2d_access', ++ 'r0008': 'inst_retired', ++ }, ++ "branch": { ++ 'r0011': 'cycle', ++ 'r200b': 'alu_isq_stall', ++ 'r200c': 'lsu_isq_stall', ++ 'r200d': 'fsu_isq_stall', ++ 'r0010': 'brmisspred', ++ 'r0012': 'brpred', ++ }, ++ "tlb": { ++ 'r0002': 'l1i_tlb_refill', ++ 'r0026': 'l1i_tlb', ++ 'r0005': 'l1d_tlb_refill', ++ 'r0025': 'l1d_tlb', ++ 'r002e': 'l2i_tlb_refill', ++ 'r0030': 'l2i_tlb', ++ 'r002d': 'l2d_tlb_refill', ++ 'r002f': 'l2d_tlb', ++ 'r0035': 'itlb_walk', ++ 'r0034': 'dtlb_walk', ++ 'r0008': 'inst_retired', ++ 'r0011': 'cycle', ++ 'r7002': 'divstall', ++ } ++ } ++ ++ # 微架构配置 ++ FW_CONFIG = {'dispatch_size': 4} ++ ++ def __init__( ++ self, ++ config_type: str, ++ ssh_client: SshClient = None, ++ duration: float = 0.1, ++ target_pid: int = 0 ++ ): ++ super().__init__() ++ self.ssh_client = ssh_client ++ self.config_type = config_type ++ self.cfg_pmu = self.PMU_CONFIGS.get(config_type, {}) ++ self.duration = duration ++ self.target_pid = target_pid ++ ++ def set_collector_param( ++ self, ++ ssh_client: SshClient, ++ duration: float = 0.1, ++ target_pid: int = 0 ++ ): ++ """设置收集器参数""" ++ self.ssh_client = ssh_client ++ self.duration = duration ++ self.target_pid = target_pid ++ self._generate_collect_command() ++ ++ def _generate_collect_command(self): ++ """生成perf收集命令""" ++ events = ",".join(self.cfg_pmu.keys()) ++ target = f"-p {self.target_pid}" if self.target_pid else "-a" ++ self.collect_cmd = f"perf stat -e {events} {target} sleep {self.duration}" ++ logging.debug(f"Generated perf command: {self.collect_cmd}") ++ ++ def collect(self): ++ """收集性能数据""" ++ if not self.ssh_client: ++ raise RuntimeError("Host information not set") ++ result = self.ssh_client.run_cmd(self.collect_cmd) ++ self._parse_perf_output(result.err_msg) ++ ++ def _parse_perf_output(self, output: str): ++ """解析perf输出""" ++ for line in output.splitlines(): ++ line = line.rstrip() ++ if not line: ++ continue ++ ++ parts = line.split() ++ if len(parts) < 2: ++ 
continue ++ ++ value = parts[0].replace(',', '') ++ event_name = parts[1] ++ ++ # 处理未计数的事件 ++ if value == " bool: ++ """检查目标进程是否在运行""" ++ # 检查主目标进程 ++ target_valid = (not self.target_pid or ++ self.is_pid_valid(self.target_pid)) ++ ++ # 在ATTACH模式下检查基准测试进程 ++ benchmark_valid = (self.mode == COLLECTMODE.DIRECT_MODE or ++ (self.mode == COLLECTMODE.ATTACH_MODE and ++ self.is_pid_valid(self.benchmark_pid))) ++ ++ return target_valid and benchmark_valid ++ ++ def run(self) -> Dict[str, float]: ++ """执行性能收集和分析""" ++ if not self.is_target_running(): ++ raise RuntimeError("Target process not running") ++ ++ while self.iter < self.max_iteration: ++ if not self.is_target_running(): ++ break ++ ++ for collector in self.collector_list: ++ collector.collect() ++ ++ self.iter += 1 ++ ++ # 处理收集到的数据 ++ all_data = {} ++ for collector in self.collector_list: ++ collector.process() ++ all_data.update(collector.processed_data) ++ ++ return all_data ++ ++ def is_pid_valid(self, pid) -> bool: ++ """检查PID是否有效""" ++ cmd = f"ps -p {pid} > /dev/null 2>&1" ++ result = self.ssh_client.run_cmd(cmd) ++ return result.status_code == 0 ++ ++ def get_process_pid(self) -> str: ++ """获取进程PID""" ++ cmd = f"pgrep -f {self.target_process_name}" ++ result = self.ssh_client.run_cmd(cmd) ++ if not result.output: ++ raise RuntimeError(f"No process found: {self.target_process_name}") ++ return sorted(result.output.split('\n'))[0] ++ ++ def print_processed_data(self): ++ """打印处理后的性能数据""" ++ for collector in self.collector_list: ++ for metric, value in collector.processed_data.items(): ++ logging.info(f"{metric}: {value:.2f}") +diff --git a/copilot-tune/src/performance_collector/network_collector.py b/copilot-tune/src/performance_collector/network_collector.py +new file mode 100644 +index 0000000..8963ebd +--- /dev/null ++++ b/copilot-tune/src/performance_collector/network_collector.py +@@ -0,0 +1,139 @@ ++import logging ++from enum import Enum ++from typing import Dict, Any, List ++ ++from .base_collector import BaseCollector ++ ++ ++class NetworkMetric(Enum): ++ TODO = "XX" ++ ++ ++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') ++ ++ListenOverflows = ("ListenOverflows1=$(cat /proc/net/netstat | grep 'TcpExt:' | awk '{print$20}' | tail -n 1); sleep 5;" ++ "ListenOverflows2=$(cat /proc/net/netstat | grep 'TcpExt:' | awk '{print$20}' | tail -n 1); " ++ "echo $((ListenOverflows2 - ListenOverflows1))") ++FullDoCookies = ("FullDoCookies1=$(cat /proc/net/netstat | grep 'TcpExt:' | awk '{print$76}' | tail -n 1); sleep 5; " ++ "FullDoCookies2=$(cat /proc/net/netstat | grep 'TcpExt:' | awk '{print$76}' | tail -n 1); " ++ "echo $((FullDoCookies2 - FullDoCookies1))") ++FullDrop = ("FullDrop1=$(cat /proc/net/netstat | grep 'TcpExt:' | awk '{print$77}' | tail -n 1); sleep 5; " ++ "FullDrop2=$(cat /proc/net/netstat | grep 'TcpExt:' | awk '{print$77}' | tail -n 1); " ++ "echo $(( FullDrop2 - FullDrop1))") ++ ++ ++def get_network_cmd() -> List[str]: ++ return list(NETWORK_PARSE_FUNCTIONS.keys()) ++ ++ ++def listenoverflows_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != ListenOverflows: ++ logging.error("Command is not 'ListenOverflows'.") ++ raise ValueError("Command is not 'ListenOverflows'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ listenOverflows = float(stdout.split("\n")[0]) ++ res = {"listenOverflows": listenOverflows} ++ except ValueError as e: ++ logging.error(f"Failed 
to parse listen overflows count from stdout: {e}") ++ raise ValueError("Failed to parse listen overflows count from stdout") from e ++ ++ return res ++ ++ ++def fulldocookies_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != FullDoCookies: ++ logging.error("Command is not 'FullDoCookies'.") ++ raise ValueError("Command is not 'FullDoCookies'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ fulldocookies = float(stdout.split("\n")[0]) ++ res = {"fulldocookies": fulldocookies} ++ except (IndexError, ValueError) as e: ++ logging.error(f"Failed to parse fulldocookies count from stdout: {e}") ++ raise ValueError("Failed to parse fulldocookies count from stdout") from e ++ ++ return res ++ ++ ++def fulldrop_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != FullDrop: ++ logging.error("Command is not 'FullDrop'.") ++ raise ValueError("Command is not 'FullDrop'.") ++ ++ if not isinstance(stdout, str): ++ logging.error("Input stdout is not a string.") ++ raise TypeError("Expected stdout to be a string") ++ ++ try: ++ fulldrop = float(stdout.split("\n")[0]) ++ res = {"fulldrop": fulldrop} ++ except (IndexError, ValueError) as e: ++ logging.error(f"Failed to parse fulldrop count from stdout: {e}") ++ raise ValueError("Failed to parse fulldrop count from stdout") from e ++ ++ return res ++ ++ ++def sar_parse( ++ cmd: str, ++ stdout: Any, ++) -> Dict: ++ if cmd != "sar -n DEV 1 1": ++ logging.error("Command is not 'sar -n DEV 1 1'.") ++ raise ValueError("Command is not 'sar -n DEV 1 1'.") ++ return {"网卡指标": stdout} ++ ++ ++NETWORK_PARSE_FUNCTIONS = { ++ ListenOverflows: listenoverflows_parse, ++ FullDoCookies: fulldocookies_parse, ++ FullDrop: fulldrop_parse, ++ "sar -n DEV 1 1": sar_parse, ++} ++ ++ ++class NetworkCollector(BaseCollector): ++ def __init__(self, cmd: List[str], **kwargs): ++ kwargs['cmds'] = cmd ++ super().__init__(**kwargs) ++ ++ def parse_cmd_stdout( ++ self, ++ network_info_stdout: Dict[str, Any], ++ ) -> Dict: ++ parse_result = {} ++ for k, v in network_info_stdout.items(): ++ # 使用字典获取对应的解析函数,如果cmd不在字典中,使用默认的解析函数 ++ parse_function = NETWORK_PARSE_FUNCTIONS.get(k, self.default_parse) ++ cmd_parse_result = parse_function(k, v) ++ parse_result = {**parse_result, **cmd_parse_result} ++ return parse_result ++ ++ def data_process( ++ self, ++ network_parse_result: Dict, ++ ) -> Dict: ++ logging.info(f"[NetworkCollector] collecting network workload metrics") ++ network_process_result = {"listenOverflows": int(network_parse_result["listenOverflows"] > 0), ++ "fulldocookies": int(network_parse_result["fulldocookies"] > 0), ++ "fulldrop": int(network_parse_result["fulldrop"] > 0), ++ "网卡指标": network_parse_result["网卡指标"]} ++ return network_process_result +diff --git a/copilot-tune/src/performance_collector/static_metric_profile_collector.py b/copilot-tune/src/performance_collector/static_metric_profile_collector.py +new file mode 100644 +index 0000000..569d47f +--- /dev/null ++++ b/copilot-tune/src/performance_collector/static_metric_profile_collector.py +@@ -0,0 +1,52 @@ ++import logging ++ ++from src.performance_collector import static_profile_collector ++from src.utils.shell_execute import get_registered_cmd_funcs ++from src.utils.thread_pool import ThreadPoolManager ++ ++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') ++ ++ ++class StaticMetricProfileCollector: ++ def __init__( ++ self, ++ ssh_client, ++ max_workers 
++ ): ++ self.ssh_client = ssh_client ++ ++ self.thread_pool = ThreadPoolManager(max_workers=max_workers) ++ self.sequential_pool = [] ++ self._add_tasks( ++ # 获取这些模块所有注册的cmd parser接口,提交到线程池执行 ++ static_profile_collector ++ ) ++ ++ def _add_tasks(self, *args): ++ for module in args: ++ func_info_list = get_registered_cmd_funcs(module) ++ self.thread_pool.add_batch( ++ [(func_info["func"], (self.ssh_client,), {"tag": func_info["tag"]}) for func_info in func_info_list] ++ ) ++ ++ def sequential_tasks(self): ++ pass ++ ++ def run(self): ++ logging.info( ++ "[StaticMetricProfileCollector] collecting static profile data ..." ++ ) ++ parsed_results = {} ++ ++ self.thread_pool.run_all_tasks() ++ task_results = self.thread_pool.get_all_results() ++ ++ for task_result in task_results: ++ if task_result.tag not in parsed_results: ++ parsed_results[task_result.tag] = {} ++ if task_result.result.status_code == 0: ++ parsed_results[task_result.tag].update(task_result.result.output) ++ else: ++ logging.warning(f"error while execute task {task_result.func_name}, err_msg is {task_result.result}") ++ ++ return parsed_results +diff --git a/copilot-tune/src/performance_collector/static_profile_collector.py b/copilot-tune/src/performance_collector/static_profile_collector.py +new file mode 100644 +index 0000000..438d1df +--- /dev/null ++++ b/copilot-tune/src/performance_collector/static_profile_collector.py +@@ -0,0 +1,246 @@ ++import logging ++import re ++ ++from src.utils.shell_execute import cmd_pipeline ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++ ++@cmd_pipeline(cmd="lscpu", tag="static", parallel=True) ++def lscpu_parser(output: str) -> dict: ++ """解析 lscpu 输出:物理/逻辑核心、主频、L3 Cache、NUMA 拓扑""" ++ metrics = {} ++ for line in output.splitlines(): ++ if ":" not in line: ++ continue ++ k, v = [x.strip() for x in line.split(":", 1)] ++ if k == "CPU(s)": ++ metrics["CPU 逻辑核心数量"] = int(v) ++ elif k == "Core(s) per socket": ++ metrics["每个物理 CPU 插槽上的核心数"] = int(v) ++ elif k == "Socket(s)": ++ metrics["物理 CPU 插槽数量"] = int(v) ++ elif k == "CPU MHz": ++ metrics["cpu_mhz"] = float(v) ++ elif k == "L3 cache": ++ # 格式示例:"8192K", "32 MiB (1 instance)" ++ m = re.match(r"(\d+)\s*([KMG]i?B?)", v, re.IGNORECASE) ++ if m: ++ num = int(m.group(1)) ++ unit = m.group(2).upper() ++ if unit in ("K", "KB", "KI", "KIB"): ++ bytes_val = num * 1024 ++ elif unit in ("M", "MB", "MI", "MIB"): ++ bytes_val = num * 1024**2 ++ elif unit in ("G", "GB", "GI", "GIB"): ++ bytes_val = num * 1024**3 ++ else: ++ bytes_val = num ++ metrics["L3 缓存容量(字节)"] = bytes_val ++ elif k == "NUMA node(s)": ++ metrics["NUMA 节点数量"] = int(v) ++ elif k.startswith("NUMA node") and "CPU(s)" in k: ++ key_name = k.lower().replace(" ", "_") ++ metrics[key_name] = v ++ return metrics ++ ++ ++@cmd_pipeline(cmd="free -b", tag="static", parallel=True) ++def free_parser(output: str) -> dict: ++ """解析 `free -b` 输出,并将内存大小转换为 GB""" ++ metrics = {} ++ ++ parts = output.split() ++ if len(parts) >= 2 and parts[0].startswith("Mem"): ++ total_bytes = int(parts[1]) ++ total_gb = total_bytes / (1024**3) ++ metrics["总共内存大小(GB)"] = round(total_gb, 2) # 保留 2 位小数 ++ return metrics ++ ++ ++@cmd_pipeline( ++ cmd="getconf PAGE_SIZE && grep HugePages_ /proc/meminfo", ++ tag="static", ++ parallel=True, ++) ++def page_hugepages_parser(output: str) -> dict: ++ """解析 getconf PAGE_SIZE && grep HugePages_ /proc/meminfo,并返回中文键名""" ++ metrics = {} ++ lines = output.splitlines() ++ if lines: ++ metrics["系统页大小(字节)"] = 
int(lines[0].strip())
++
++    field_map = {
++        "Total": "HugePages 总数",
++        "Free": "HugePages 空闲数",
++        "Rsvd": "HugePages 保留但未使用数",
++        "Surp": "HugePages 超量分配数",
++    }
++
++    for line in lines[1:]:
++        m = re.match(r"HugePages_(\w+):\s+(\d+)", line)
++        if m:
++            key_en = m.group(1)
++            key_cn = field_map.get(key_en, f"HugePages_{key_en}")
++            metrics[key_cn] = int(m.group(2))
++
++    return metrics
++
++
++@cmd_pipeline(cmd="lsblk -dn -o NAME,ROTA,TYPE", tag="static", parallel=True)
++def lsblk_parser(output: str) -> dict:
++    """
++    解析 `lsblk -dn -o NAME,ROTA,TYPE` 输出,返回中文键名的磁盘类型信息:
++    - ROTA=1 表示旋转盘(HDD),0 表示固态盘(SSD/NVMe)
++    - TYPE=disk 表示磁盘设备
++    """
++    metrics = {}
++    for line in output.splitlines():
++        name, rota, typ = line.split()
++        # lsblk 的 TYPE 列输出完整单词 "disk",而非缩写 "d"
++        if typ != "disk":
++            continue
++        t = "机械硬盘(HDD)" if rota == "1" else "固态硬盘(SSD/NVMe)"
++        metrics[f"磁盘 {name} 类型"] = t
++    return metrics
++
++
++@cmd_pipeline(cmd="iostat -dx -k 1 2", tag="static", parallel=True)
++def iostat_parser(output: str) -> dict:
++    """
++    解析 iostat -dx -k 1 2,取第二次监测
++    指标:单盘 IOPS, 顺/随机 吞吐(KB/s)
++    """
++    metrics = {}
++    lines = [
++        l
++        for l in output.splitlines()
++        if l and not l.startswith("Linux") and not l.startswith("avg-cpu")
++    ]
++    # 找到最后一次 block 设备报告开始行
++    # 格式: Device: rrqm/s wrqm/s r/s w/s ...
++    header_idx = None
++    for i, l in enumerate(lines):
++        if l.startswith("Device"):
++            header_idx = i
++    if header_idx is None:
++        return metrics
++    hdr = re.split(r"\s+", lines[header_idx].strip())
++    for l in lines[header_idx + 1 :]:
++        cols = re.split(r"\s+", l.strip())
++        if len(cols) != len(hdr):
++            continue
++        dev = cols[0]
++        data = dict(zip(hdr, cols))
++        # IOPS
++        metrics[f"{dev}_iops"] = float(data.get("r/s", 0)) + float(data.get("w/s", 0))
++        # 吞吐
++        metrics[f"{dev}_读操作吞吐率_kB_s"] = float(data.get("rkB/s", 0))
++        metrics[f"{dev}_写操作吞吐率_kB_s"] = float(data.get("wkB/s", 0))
++    return metrics
++
++
++@cmd_pipeline(
++    cmd='for d in /sys/block/*/queue/nr_requests; do echo "$d $(cat $d)"; done',
++    tag="static",
++    parallel=True,
++)
++def queue_depth_parser(output: str) -> dict:
++    """解析 /sys/block/*/queue/nr_requests"""
++    metrics = {}
++    for line in output.splitlines():
++        path, val = line.split()
++        dev = path.split("/")[3]
++        metrics[f"块设备{dev}_队列请求深度"] = int(val)
++    return metrics
++
++
++@cmd_pipeline(cmd="cat /proc/mdstat", tag="static", parallel=True)
++def raid_parser(output: str) -> dict:
++    """解析 /proc/mdstat:判断是否存在 md 设备及其 RAID 类型(中文键名)"""
++    metrics = {}
++    for line in output.splitlines():
++        if line.startswith("md"):
++            parts = line.split()
++            name = parts[0]
++            # 例如:md0 : active raid1 sda1[0] sdb1[1]
++            if "raid" in line:
++                m = re.search(r"raid(\d+)", line)
++                if m:
++                    metrics[f"阵列设备 {name} 类型"] = f"RAID{m.group(1)}"
++    return metrics
++
++
++@cmd_pipeline(cmd="df -T -x tmpfs -x devtmpfs", tag="static", parallel=True)
++def df_parser(output: str) -> dict:
++    """
++    解析 df -T -x tmpfs -x devtmpfs
++    返回每个挂载点的文件系统类型
++    """
++    metrics = {}
++    lines = output.strip().splitlines()
++    if len(lines) < 2:
++        return metrics
++    header = re.split(r"\s+", lines[0].strip())
++    idx_fs = header.index("Type")
++    idx_mount = header.index("Mounted")
++    for l in lines[1:]:
++        cols = re.split(r"\s+", l.strip())
++        fs = cols[idx_fs]
++        mnt = cols[idx_mount]
++        metrics[f"fs_{mnt}"] = fs
++    return metrics
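++
++
++# 下面是一个沿用 @cmd_pipeline 模式的示意性采集器草图(假设装饰器负责在目标机
++# 执行 cmd 并把 stdout 传给被装饰函数解析,与本文件其余采集器的用法一致):
++@cmd_pipeline(cmd="sysctl -n vm.swappiness", tag="static", parallel=True)
++def swappiness_parser(output: str) -> dict:
++    """解析 sysctl -n vm.swappiness 输出:内存换出倾向"""
++    metrics = {}
++    value = output.strip()
++    if value.isdigit():
++        metrics["vm.swappiness"] = int(value)
++    return metrics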
++
++
++@cmd_pipeline(
++    cmd="ethtool -l $(ls /sys/class/net | grep -v lo | head -n1)",
++    tag="static",
++    parallel=True,
++)
++def nic_queues_parser(output: str) -> dict:
++    """
++    解析 ethtool -l 输出:NIC 多队列信息(Combined 队列数来自 ethtool -l)
++    """
++    metrics = {}
++    for line in output.splitlines():
++        if "Combined:" in line:
++            _, val = line.split(":", 1)
++            metrics["nic_combined_queues"] = int(val.strip())
++    return metrics
++
++
++@cmd_pipeline(
++    cmd="ethtool $(ls /sys/class/net | grep -v lo | head -n1)",
++    tag="static",
++    parallel=True,
++)
++def ethtool_speed_parser(output: str) -> dict:
++    """
++    解析 ethtool 输出:网络带宽(Speed 字段来自不带 -l 的 ethtool)
++    """
++    metrics = {}
++    m = re.search(r"Speed:\s*(\d+)([GM]b/s)", output)
++    if m:
++        metrics["网络速度"] = m.group(1) + m.group(2)
++    return metrics
++
++
++@cmd_pipeline(cmd="lspci -vv | grep -i sriov -A5", tag="static", parallel=True)
++def sriov_parser(output: str) -> dict:
++    """
++    解析 lspci -vv | grep -i sriov -A5:是否支持 SR-IOV,最大 VF 数
++    """
++    metrics = {}
++    for line in output.splitlines():
++        # "Total VFs:" 位于 SR-IOV 能力段内的独立行,与 "SR-IOV" 不在同一行,单独匹配即可
++        if "Total VFs:" in line:
++            m = re.search(r"Total VFs:\s*(\d+)", line)
++            if m:
++                metrics["nic_sriov_total_vfs"] = int(m.group(1))
++    return metrics
++
++
++@cmd_pipeline(cmd="ulimit -n", tag="static", parallel=True)
++def fdlimit_parser(output: str) -> dict:
++    """解析 ulimit -n 输出:文件描述符上限"""
++    return {"最大文件描述符": int(output.strip())}
+diff --git a/copilot-tune/src/performance_optimizer/__init__.py b/copilot-tune/src/performance_optimizer/__init__.py
+new file mode 100644
+index 0000000..e69de29
+diff --git a/copilot-tune/src/performance_optimizer/base_optimizer.py b/copilot-tune/src/performance_optimizer/base_optimizer.py
+new file mode 100644
+index 0000000..5a4421d
+--- /dev/null
++++ b/copilot-tune/src/performance_optimizer/base_optimizer.py
+@@ -0,0 +1,185 @@
++import logging
++import os
++from abc import abstractmethod
++from typing import Dict, List, Any, Tuple, Optional
++
++import yaml
++from pydantic import BaseModel
++
++from src.utils.constant import OPTIMIZE_CONFIG_PATH
++from src.utils.llm import get_llm_response
++from src.utils.shell_execute import SshClient
++
++# 配置日志
++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
++
++
++class OptimizerArgs(BaseModel):
++    bottle_neck: str = ""
++    application: str = ""
++    system_report: str = ""
++    ssh_client: Optional[SshClient] = None
++    target_config_path: str = ""
++
++    model_config = {
++        "arbitrary_types_allowed": True
++    }  # TODO: 统一配置文件
++
++
++class BaseOptimizer:
++    def __init__(self, **kwargs):
++        self.args = OptimizerArgs(**kwargs)
++
++    # 写死的命令->执行->保证执行不出错
++    # llm生成的脚本->建议
++    @abstractmethod
++    def think(
++            self,
++            history: List
++    ) -> Tuple[bool, str]:
++        pass
++
++    @abstractmethod
++    def get_bash_script(self, **kwargs) -> str:
++        pass
++
++    # 若执行失败,则需要根据报错信息进行修复(todo)
++    def act(
++            self,
++            is_execute: bool,
++            plan: str
++    ) -> bool:
++        if not is_execute:
++            return is_execute
++        try:
++            # 将脚本内容写入临时文件
++            with open('temp_script.sh', 'w') as file:
++                file.write(plan)
++
++            # 使脚本文件可执行
++            self.args.ssh_client.run_cmd(cmd='chmod +x temp_script.sh')
++
++            # 执行脚本
++            self.args.ssh_client.run_cmd(cmd='./temp_script.sh')
++        except Exception as e:
++            logging.error(f"执行优化脚本时发生错误:{e}")
++            return False
++        finally:
++            # 清理临时文件;不在 finally 中 return,以免覆盖异常分支的返回值
++            self.args.ssh_client.run_cmd(cmd='rm temp_script.sh')
++        return True
++
++    # 如果plan是建议,则通过人的反馈获取优化结果,需要与人交互
++    # 如果plan是执行,则自动化获取优化结果(需要配置获取优化结果的方法),不需要与人交互
++    # 返回值是观察结果
++    def observe(
++            self,
++            is_execute: bool,
++            plan: str
++    ) -> Dict:
++        if not is_execute:
++            human_response = self.get_human_response(plan=plan)
++            prompt = f"""
++            # CONTEXT #
++            以下内容是用户基于调优结果的反馈:
++            {human_response}
++
++            # OBJECTIVE #
++            请根据以上信息,判断用户的性能优化目标是否达到。
++
++            # STYLE
# ++ 你是一个专业的系统运维专家,你只用回答True或False ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请认真思考后给出你的答案。 ++ ++ # RESPONSE FORMAT # ++ 请回答True或False,不要有多余文字。 ++ ++ """ ++ if "true" in get_llm_response(prompt=prompt).lower(): ++ return { ++ "isfinished": True, ++ "reason": human_response ++ } ++ else: ++ return { ++ "isfinished": False, ++ "reason": human_response ++ } ++ isfinished, response = self.get_optimize_result(config_path=self.args.target_config_path) ++ if isfinished: ++ return { ++ "isfinished": True, ++ "reason": response ++ } ++ else: ++ return { ++ "isfinished": False, ++ "reason": response ++ } ++ ++ # 从配置文件中获取如何得到优化结果的方法(todo) ++ # 配置文件需要写明如何获得优化结果、优化结果如何判断是否满足用户需求(最好是公式,仿照A-Tune) ++ def get_optimize_result( ++ self, ++ config_path: str ++ ) -> Tuple[bool, str]: ++ pass ++ ++ # fake human feedback tmp(todo) ++ def get_human_response( ++ self, ++ plan: str ++ ) -> str: ++ ask_human = f"here is my advice: {plan}, please try this plan and let me know how it works out. You need to tell me if the tuning plan meet your requirement. if not, the more detailed information you provide, the better it helps me make a improvement." ++ human_response = "yeh, it works out, this solution has already taken effect and met my goals." ++ return human_response ++ ++ def get_tuning_config( ++ self, ++ ) -> Dict[str, Any]: ++ current_file_path = os.path.abspath(__file__) ++ current_dir_path = os.path.dirname(current_file_path) ++ config_file = os.path.join(current_dir_path, '..', '..', 'config', 'optimize_config.yaml') ++ if not os.path.exists(config_file) or not os.path.isfile(config_file): ++ config_file = OPTIMIZE_CONFIG_PATH ++ try: ++ with open(config_file, "r") as f: ++ tuning_config = yaml.safe_load(f) ++ return tuning_config ++ except Exception as e: ++ logging.error(f"Failed to parse optimize_config.yaml: {e}") ++ ++ # 诊断的loop无需借助优化的结果,maybe <= 3(todo) ++ def run(self) -> Any: ++ optimization_plan = "" ++ optimization_feedback = { ++ "isfinished": False, ++ "reason": "" ++ } ++ isfinished = False ++ rounds = 1 ++ record = [] ++ while not isfinished: ++ is_execute, optimization_plan = self.think(history=record) ++ is_execute = self.act(is_execute=is_execute, plan=optimization_plan) ++ optimization_feedback = self.observe(is_execute=is_execute, plan=optimization_plan) ++ isfinished = optimization_feedback["isfinished"] ++ if isfinished: ++ record.append( ++ f"in '{rounds}'th round, the optimization plan is '{optimization_plan}', and the tuning task has been finished, the reason is: '{optimization_feedback['reason']}'") ++ logging.info( ++ f"in '{rounds}'th round, the optimization plan is '{optimization_plan}', and the tuning task has been finished, the reason is: '{optimization_feedback['reason']}'") ++ else: ++ record.append( ++ f"in '{rounds}'th round, the optimization plan is '{optimization_plan}', and the tuning task has not been finished, the reason is: '{optimization_feedback['reason']}'") ++ logging.info( ++ f"in '{rounds}'th round, the optimization plan is '{optimization_plan}', and the tuning task has not been finished, the reason is: '{optimization_feedback['reason']}'") ++ rounds += 1 ++ if rounds > 5: ++ break ++ return optimization_plan, isfinished, optimization_feedback["reason"] +diff --git a/copilot-tune/src/performance_optimizer/knob_optimizer.py b/copilot-tune/src/performance_optimizer/knob_optimizer.py +new file mode 100644 +index 0000000..f3dbf6d +--- /dev/null ++++ b/copilot-tune/src/performance_optimizer/knob_optimizer.py +@@ -0,0 +1,132 @@ 
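++# 使用示意(草图,假设构造入参与 OptimizerArgs 字段一致,调用方式沿用 BaseOptimizer.run 的约定):
++#   optimizer = KnobOptimizer(bottle_neck="CPU", application="mysql",
++#                             system_report="...", target_config_path="")
++#   plan, finished, reason = optimizer.run()
++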
++import json
++import logging
++import os
++from typing import List, Tuple
++
++from src.utils.rag.knob_rag import KnobRag
++from .base_optimizer import BaseOptimizer
++from ..utils.constant import KNOB_RAG_CONFIG_PATH
++
++logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
++
++CORE_CPU_KNOBS = [
++    "kernel.numa_balancing",
++    "kernel.sched_autogroup_enabled",
++    "kernel.sched_wakeup_granularity_ns",
++    "kernel.sched_min_granularity"
++]
++CORE_MEMORY_KNOBS = []
++CORE_DISK_KNOBS = []
++CORE_NETWORK_KNOBS = []
++CORE_MYSQL_KNOBS = [
++    "mysql.innodb_thread_concurrency",
++    "mysql.innodb_lru_scan_depth",
++    "mysql.innodb_flush_log_at_trx_commit",
++    "mysql.innodb_spin_wait_delay",
++    "mysql.innodb_log_buffer_size",
++    "mysql.sync_binlog",
++    "mysql.innodb_sync_spin_loops",
++    "mysql.innodb_write_io_threads",
++    "mysql.innodb_read_io_threads",
++    "mysql.innodb_purge_threads",
++    "mysql.innodb_buffer_pool_instances",
++]
++CORE_NGINX_KNOBS = []
++CORE_REDIS_KNOBS = []
++
++
++class KnobOptimizer(BaseOptimizer):
++    def __init__(self, **kwargs):
++        super().__init__(**kwargs)
++
++    # 根据有无history区分静态调优和动态调优?
++    # 基于动态调优的待实现(todo)
++    # 基于历史的重新推荐,可以提炼成一个函数
++    # 当瓶颈为none时,怎么处理?是否应该不执行优化?
++    def think(
++            self,
++            history: List
++    ) -> Tuple[bool, str]:
++        tuning_config = self.get_tuning_config()
++        if tuning_config["knob_tuning"] == "static":
++            if not history:
++                core_system_knob = self.get_core_system_knob()
++                core_app_knob = self.get_core_app_knob()
++                current_file_path = os.path.abspath(__file__)
++                current_dir_path = os.path.dirname(current_file_path)
++                rag_config_path = os.path.join(current_dir_path, '..', '..', 'config', 'knob_rag_config.json')
++                if not os.path.exists(rag_config_path) or not os.path.isfile(rag_config_path):
++                    rag_config_path = KNOB_RAG_CONFIG_PATH
++                rag = KnobRag(config_path=rag_config_path, bottle_neck=self.args.bottle_neck,
++                              application=self.args.application, system_report=self.args.system_report)
++                knobs = rag.run()
++                knobs.extend([knob for knob in core_system_knob if knob not in knobs])
++                # todo 当用户输入mysql时,但mysql实际没有运行,则其实不应该把这些参数进行添加。
++                knobs.extend([knob for knob in core_app_knob if knob not in knobs])
++
++                set_knob_config_path = os.path.join(current_dir_path, 'set_knob_cmd.jsonl')
++                set_knob_cmd = {}
++                with open(set_knob_config_path, "r", encoding="utf-8") as f:
++                    for line in f.readlines():
++                        set_knob_cmd = set_knob_cmd | json.loads(line)
++                cmd_list = []
++                for knob in knobs:
++                    if knob in set_knob_cmd:
++                        cmd_list.append(set_knob_cmd[knob])
++                return False, self.get_bash_script(cmd_list)
++            else:
++                pass
++        elif tuning_config["knob_tuning"] == "dynamic":
++            pass
++        else:
++            # 异常处理(todo)
++            pass
++
++    def get_bash_script(
++            self,
++            cmd_list: List
++    ) -> str:
++        # 脚本内容的开头部分
++        script_header = (
++            "#!/bin/bash\n\n"
++            "echo 'starting set parameters value...'\n"
++        )
++
++        # 将命令列表转换为脚本中的行
++        commands_str = "\n".join(cmd_list) + "\n"
++
++        # 脚本内容的结尾部分
++        script_footer = (
++            "\necho 'set parameters value done!'\n"
++        )
++
++        script_content = script_header + commands_str + script_footer
++        return script_content
++
++    def get_core_system_knob(
++            self,
++    ) -> List[str]:
++        if self.args.bottle_neck.upper() == "CPU":
++            return CORE_CPU_KNOBS
++        elif self.args.bottle_neck.upper() == "MEMORY":
++            return CORE_MEMORY_KNOBS
++        elif self.args.bottle_neck.upper() == "DISK":
++            return CORE_DISK_KNOBS
++        elif self.args.bottle_neck.upper() == "NETWORK":
++            return CORE_NETWORK_KNOBS
++        else:
++            return []
++
++    def
get_core_app_knob( ++ self, ++ ) -> List[str]: ++ if not self.args.application: ++ return [] ++ if self.args.application.upper() == "MYSQL": ++ return CORE_MYSQL_KNOBS ++ elif self.args.application.upper() == "NGINX": ++ return CORE_NGINX_KNOBS ++ elif self.args.application.upper() == "REDIS": ++ return CORE_REDIS_KNOBS ++ else: ++ return [] +diff --git a/copilot-tune/src/performance_optimizer/param_knowledge.py b/copilot-tune/src/performance_optimizer/param_knowledge.py +new file mode 100644 +index 0000000..1d2128f +--- /dev/null ++++ b/copilot-tune/src/performance_optimizer/param_knowledge.py +@@ -0,0 +1,117 @@ ++import logging ++import threading ++from typing import Iterable ++ ++from tqdm import tqdm ++ ++from src.utils.config.app_config import AppInterface ++from src.utils.config.global_config import param_config ++from src.utils.shell_execute import SshClient ++ ++ ++class ParamKnowledge: ++ _instance = None ++ _lock = threading.Lock() ++ ++ def __new__(cls, ++ ssh_client: SshClient, ++ tune_system_param: bool = False, ++ tune_app_param: bool = True): ++ if not cls._instance: ++ with cls._lock: ++ if not cls._instance: ++ cls._instance = super(ParamKnowledge, cls).__new__(cls) ++ cls._instance.param_config = param_config ++ cls._instance.ssh_client = ssh_client # 保存 ssh_client ++ cls._instance.tune_system_param = tune_system_param ++ cls._instance.tune_app_param = tune_app_param ++ return cls._instance ++ ++ def __init__(self, ++ ssh_client: SshClient, ++ tune_system_param: bool = False, ++ tune_app_param: bool = True): ++ logging.info(f"[ParamKnowledge] initializing param knowledge base ...") ++ # 防止重复初始化 ++ if not hasattr(self, "tune_system_param"): ++ self.tune_system_param = tune_system_param ++ if not hasattr(self, "tune_app_param"): ++ self.tune_app_param = tune_app_param ++ if not hasattr(self, "ssh_client"): ++ self.ssh_client = ssh_client ++ ++ def get_params(self, app_name): ++ # check 应用和系统参数是否有重名的 ++ logging.info(f"[ParamKnowledge] checking params ...") ++ system_params = set() ++ app_params = set() ++ all_params = [] ++ if self.tune_system_param: ++ system_params = set(self.param_config.get("system", {}).keys()) ++ all_params += list(self.param_config.get("system").keys()) ++ if self.tune_app_param: ++ app_params = set(self.param_config.get(app_name, {}).keys()) ++ all_params += list(self.param_config.get(app_name).keys()) ++ union_params = system_params & app_params ++ if union_params: ++ raise RuntimeError( ++ f"Duplicate keys ({union_params}) detected between application parameters and system parameters." 
++ ) ++ return all_params ++ ++ def describe_param_background_knob(self, app_name: str, params: Iterable): ++ logging.info(f"[ParamKnowledge] building param knowledge base ...") ++ params_describe_list = [] ++ app_params = self.param_config.get(app_name.lower()) ++ system_params = self.param_config.get("system") ++ app = AppInterface(self.ssh_client).get(app_name) ++ for param_name in tqdm(params): ++ item = ( ++ app_params.get(param_name) ++ if param_name in app_params ++ else system_params.get(param_name) ++ ) ++ if not item: ++ print(f"param {param_name} not in app param or system param") ++ continue ++ # 1.描述参数范围 ++ if item["range"]: ++ if item["type"] == "discrete": ++ param_range = "、".join(list(map(str, item["range"]))) ++ else: ++ param_range = f"从{item['range'][0]}到{item['range'][1]}" ++ else: ++ param_range = None ++ # 2.当前环境取值 ++ param_result = app.get_param(param_name=param_name) ++ param_env_value = ( ++ param_result.output if param_result.status_code == 0 else "默认值" ++ ) ++ params_describe_list.append( ++ f"{param_name}:{item['desc']},参数数据类型为:{item['dtype']},参数的取值范围是:{param_range}, 当前环境取值为:{param_env_value}" ++ ) ++ logging.info(f"[ParamKnowledge] initialize param knowledge base finished!") ++ return params_describe_list ++ ++ ++if __name__ == "__main__": ++ class Result: ++ def __init__(self, status_code, output): ++ self.status_code = status_code ++ self.output = output ++ ++ ++ class SshClient: ++ def __init__(self): ++ pass ++ ++ def run_cmd(self, cmd): ++ return Result(0, "12") ++ ++ ++ ssh_client = SshClient() ++ param_knowledge = ParamKnowledge(ssh_client) ++ res = param_knowledge.describe_param_background_knob( ++ "mysql", ["innodb_adaptive_flushing"] ++ ) ++ print(res) +diff --git a/copilot-tune/src/performance_optimizer/param_optimizer.py b/copilot-tune/src/performance_optimizer/param_optimizer.py +new file mode 100644 +index 0000000..8da3e21 +--- /dev/null ++++ b/copilot-tune/src/performance_optimizer/param_optimizer.py +@@ -0,0 +1,215 @@ ++import logging ++ ++from src.performance_optimizer.param_recommender import ParamRecommender ++from src.performance_test.pressure_test import wait_for_pressure_test ++from src.utils.config.app_config import AppInterface ++from src.utils.shell_execute import SshClient ++ ++# 配置日志 ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++ ++class ParamOptimizer: ++ ++ def __init__( ++ self, ++ service_name: str, ++ slo_goal: float, ++ analysis_report: str, ++ static_profile: str, ++ ssh_client: SshClient, ++ slo_calc_callback: callable, ++ max_iterations: int = 10, ++ need_restart_application: bool = False, ++ pressure_test_mode: bool = False, ++ tune_system_param: bool = False, ++ tune_app_param: bool = True, ++ need_recover_cluster: bool = False, ++ benchmark_timeout: int = 3600 ++ ): ++ self.service_name = service_name ++ self.analysis_report = analysis_report ++ self.static_profile = static_profile ++ self.ssh_client = ssh_client ++ self.pressure_test_mode = pressure_test_mode ++ self.max_iterations = max_iterations ++ # 计算slo指标提升方式的回调函数,输入是benchmark返回的性能指标,输出是业务性能提升比例 ++ self.slo_calc_callback = slo_calc_callback ++ # 业务预期指标提升的目标 ++ self.slo_goal = slo_goal ++ # 应用接口,包括应用参数下发、benchmark执行等操作 ++ self.app_interface = AppInterface(ssh_client).get(service_name) ++ self.system_interface = AppInterface(ssh_client).system ++ self.need_restart_application = need_restart_application ++ self.need_recover_cluster = need_recover_cluster ++ self.param_recommender = ParamRecommender( ++ 
service_name=service_name, ++ slo_goal=slo_goal, ++ performance_metric=self.app_interface.performance_metric, ++ static_profile=static_profile, ++ performance_analysis_report=analysis_report, ++ ssh_client=ssh_client, ++ tune_system_param=tune_system_param, ++ tune_app_param=tune_app_param ++ ) ++ self.first_restart_save = True ++ self.benchmark_timeout=benchmark_timeout ++ ++ def calc_improve_rate(self, baseline, benchmark_result, symbol): ++ return self.slo_calc_callback(baseline, benchmark_result, symbol) ++ ++ def reached_goal(self, baseline, benchmark_result, symbol): ++ if self.calc_improve_rate(baseline, benchmark_result, symbol) >= self.slo_goal: ++ return True ++ return False ++ ++ def benchmark(self): ++ logging.info("🔄 正在验证benchmark性能...") ++ result = self.app_interface.benchmark() ++ if result.status_code == 0 and result.output: ++ return float(result.output) ++ else: ++ raise RuntimeError(f"failed to execute benchmark because {result.err_msg}") ++ ++ def apply_params(self, recommend_params): ++ for param_name, param_value in recommend_params.items(): ++ apply_result = self.app_interface.set_param(param_name, param_value) ++ if apply_result.status_code == 0: ++ logging.info(f"设置参数{param_name}为{param_value}") ++ else: ++ logging.info(f"设置参数{param_name}失败,原因是:{apply_result.err_msg}") ++ ++ def restart_application(self): ++ logging.info("🔄 正在重启应用 ...") ++ stop_result = self.app_interface.stop_workload() ++ if stop_result.status_code != 0: ++ raise RuntimeError( ++ f"failed to stop application because {stop_result.err_msg}" ++ ) ++ start_result = self.app_interface.start_workload() ++ if start_result.status_code != 0: ++ raise RuntimeError( ++ f"failed to start application because {start_result.err_msg}" ++ ) ++ ++ def recover_cluster(self): ++ print("🔄 正在恢复集群 ...") ++ recover_result = self.app_interface.recover_workload() ++ if recover_result.status_code != 0: ++ raise RuntimeError( ++ f"failed to recover cluster because {recover_result.err_msg}" ++ ) ++ ++ def save_restart_params_to_script(self, recommend_params, script_path, batch_id): ++ """ ++ 将推荐参数保存到脚本中(仅在调优过程中需要重置参数的情况使用) ++ """ ++ ++ commands = [] ++ for param_name, param_value in recommend_params.items(): ++ cmd = self.app_interface.generate_set_command(param_name, param_value) ++ if cmd: ++ commands.append(cmd) ++ ++ if not commands: ++ print(f"第 {batch_id} 轮无需要重启生效的参数,跳过写入脚本。") ++ return ++ ++ # 构建要追加的内容 ++ batch_header = f"\n# 批次 {batch_id} - 重启后生效参数\n" ++ content = batch_header + '\n'.join(commands) ++ ++ if self.first_restart_save: ++ init_cmd = f"echo '#!/bin/bash' > {script_path}" ++ self.ssh_client.run_cmd(init_cmd) ++ self.first_restart_save = False ++ print(f"首次创建重启参数脚本: {script_path}") ++ ++ append_cmd = f"cat << 'EOF' >> {script_path}\n{content}\nEOF" ++ self.ssh_client.run_cmd(append_cmd) ++ ++ print(f"已将 {len(commands)} 个参数写入重启脚本: {script_path}") ++ ++ def run(self): ++ # 运行benchmark,摸底参数性能指标 ++ if self.pressure_test_mode: ++ logging.info(f"[ParamOptimizer] waiting for pressure test finished ...") ++ pressure_test_result = wait_for_pressure_test(timeout=self.benchmark_timeout) ++ ++ if pressure_test_result.status_code != 0: ++ raise RuntimeError( ++ f"[ParamOptimizer] failed to run pressure test, err msg is {pressure_test_result.err_msg}" ++ ) ++ ++ baseline = float(pressure_test_result.output.output) ++ logging.info( ++ f"[ParamOptimizer] pressure test finished, baseline is {baseline}" ++ ) ++ else: ++ baseline = self.benchmark() ++ # 保存每轮调优的结果,反思调优目标是否达到 ++ historys = { ++ "历史最佳结果": {}, ++ "历史最差结果": {}, ++ 
"上一轮调优结果": {} ++ } ++ best_result = baseline ++ worst_result = baseline ++ is_positive = True ++ symbol = self.app_interface.get_calculate_type() ++ logging.info( ++ f"[{0}/{self.max_iterations}] 性能基线是:{baseline}" ++ ) ++ ++ for i in range(self.max_iterations): ++ # 未达成目标的情况下,根据调优结果与历史最优的参数,执行参数调优推荐,给出参数名和参数值 ++ recommend_params = self.param_recommender.run(history_result=historys, is_positive=is_positive) ++ ++ # 设置参数生效 ++ self.apply_params(recommend_params) ++ if self.need_restart_application: ++ self.restart_application() ++ ++ # 执行benchmark,反馈调优结果 ++ performance_result = self.benchmark() ++ if self.need_recover_cluster: ++ # 保存在一个/tmp目录下的脚本中 ++ script_path = '/tmp/euler-copilot-params.sh' ++ self.save_restart_params_to_script(recommend_params, script_path, i + 1) ++ self.recover_cluster() ++ ++ if performance_result * symbol < baseline: ++ is_positive = False ++ else: ++ is_positive = True ++ ++ if performance_result * symbol > best_result * symbol: ++ best_result = performance_result ++ best_history = {"最佳性能": performance_result, "参数推荐": recommend_params} ++ historys["历史最佳结果"] = best_history ++ ++ if performance_result * symbol < worst_result * symbol: ++ worst_result = performance_result ++ worst_history = {"最差性能": performance_result, "参数推荐": recommend_params} ++ historys["历史最差结果"] = worst_history ++ ++ historys["上一轮调优结果"] = {"上一轮性能": performance_result, "参数推荐": recommend_params} ++ ++ ratio = self.calc_improve_rate(baseline, performance_result, symbol) ++ ++ # 达到预期效果,则退出循环 ++ if self.reached_goal(baseline, performance_result, symbol): ++ logging.info( ++ f"[{i + 1}/{self.max_iterations}] 性能基线是:{baseline}, 最佳结果:{best_result}, 本轮结果:{performance_result if performance_result is not None else '-'}, 性能提升:{ratio:.2%}" ++ ) ++ break ++ ++ logging.info( ++ f"[{i + 1}/{self.max_iterations}] 性能基线是:{baseline}, 最佳结果:{best_result}, 本轮结果:{performance_result if performance_result is not None else '-'}, 性能提升:{ratio:.2%}" ++ ) ++ ++ logging.info( ++ f"调优完毕,{'达到' if self.reached_goal(baseline, best_result, symbol) else '未达到'} 预期目标" ++ ) +diff --git a/copilot-tune/src/performance_optimizer/param_recommender.py b/copilot-tune/src/performance_optimizer/param_recommender.py +new file mode 100644 +index 0000000..bbc4556 +--- /dev/null ++++ b/copilot-tune/src/performance_optimizer/param_recommender.py +@@ -0,0 +1,177 @@ ++import logging ++ ++from src.performance_analyzer.performance_analyzer import PerformanceAnalyzer ++from src.performance_collector.metric_collector import MetricCollector ++from src.performance_collector.static_metric_profile_collector import ( ++ StaticMetricProfileCollector, ++) ++from src.performance_optimizer.param_knowledge import ParamKnowledge ++from src.utils.json_repair import json_repair ++from src.utils.llm import get_llm_response ++from src.utils.metrics import PerformanceMetric ++from src.utils.shell_execute import SshClient ++from src.utils.thread_pool import thread_pool_manager ++ ++# 配置日志 ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++ ++class ParamRecommender: ++ ++ def __init__( ++ self, ++ service_name: str, ++ slo_goal: float, ++ performance_metric: PerformanceMetric, ++ static_profile: str, ++ performance_analysis_report: str, ++ chunk_size=20, ++ ssh_client=None, ++ tune_system_param: bool = False, ++ tune_app_param: bool = True ++ ): ++ # 待调优app名称 ++ self.service_name = service_name ++ # 业务性能调优目标,指标提升百分比 ++ self.slo_goal = slo_goal ++ # 性能指标介绍 ++ self.performance_metric = performance_metric ++ # 静态指标 ++ 
self.static_profile = "\n".join(f"{k}: {v}" for k, v in static_profile.items()) ++ # 可调参数知识库,用于给大模型描述应用参数背景知识 ++ self.param_knowledge = ParamKnowledge( ++ ssh_client=ssh_client, ++ tune_system_param=tune_system_param, ++ tune_app_param=tune_app_param ++ ) ++ self.all_params = self.param_knowledge.get_params(service_name) ++ self.ssh_client = ssh_client ++ self.params_set = self.param_knowledge.describe_param_background_knob( ++ service_name, self.all_params ++ ) ++ self.chunk_size = chunk_size ++ self.performance_analysis_report = performance_analysis_report ++ ++ def _process_chunk(self, history_result, cur_params_set, is_positive): ++ recommend_prompt = f""" ++ # CONTEXT # ++ 本次性能优化的目标为: ++ 性能指标为{self.performance_metric.name}, 该指标的含义为:{self.performance_metric.value},目标是提升{self.slo_goal:.2%} ++ 性能分析报告: ++ {self.performance_analysis_report} ++ 你可以分析的参数有: ++ {",".join(cur_params_set)} ++ # OBJECTIVE # ++ 你是一个专业的系统运维专家,当前性能指标未达到预期,请你基于以上性能分析报告分析有哪些调优思路。 ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请认真思考后给出你的答案。 ++ """ ++ optimized_idea = get_llm_response(recommend_prompt) ++ recommended_params = self.recommend( ++ history_result, optimized_idea, cur_params_set, is_positive ++ ) ++ ++ recommended_params_set = json_repair(recommended_params) ++ ++ result = {} ++ for param_name, param_value in recommended_params_set.items(): ++ if param_name in self.all_params: ++ result[param_name] = param_value ++ return result ++ ++ def run(self, history_result, is_positive=True): ++ resultset = {} ++ ++ for i in range(0, len(self.params_set), self.chunk_size): ++ cur_params_set = self.params_set[i: i + self.chunk_size] ++ # 提交任务给线程池,返回 future-like 对象(你线程池需要支持这个) ++ thread_pool_manager.add_task( ++ self._process_chunk, history_result, cur_params_set, is_positive ++ ) ++ ++ thread_pool_manager.run_all_tasks() ++ task_results = thread_pool_manager.get_all_results() ++ ++ for task_result in task_results: ++ if task_result.status_code != 0: ++ raise RuntimeError( ++ f"failed to execute task {task_result.func_name}, exception is {task_result.result}" ++ ) ++ resultset.update(task_result.result) ++ ++ return resultset ++ ++ def recommend(self, history_result, optimization_idea, cur_params_set, is_positive): ++ history_result = str(history_result) if history_result else "无" ++ params_set_str = "\n".join(cur_params_set) ++ if is_positive: ++ prompt = f""" ++ 你是专业的系统运维专家。当前性能指标未达预期,但上一轮调优为正向结果(性能提升或无退化)。 ++ 请在“心中完成推理”,只输出最终 JSON;除 JSON 以外不要输出任何文字、代码块或注释。 ++ ++ 目标:基于以下信息,在保持上轮有效方向的前提下,总结参数调整经验,进一步微调参数(在安全边界内适度加大力度),仅给出需要变更的参数与推荐新值。 ++ ++ 当前环境配置信息: ++ {self.static_profile} ++ ++ 历史调优信息(包含已修改参数与结果): ++ {history_result} ++ ++ 调优思路: ++ {optimization_idea} ++ ++ 可调整参数全集(含类型/范围/枚举/默认值等)以及baseline对应的的取值为: ++ {params_set_str} ++ ++ 严格规则(务必遵守): ++ 1) 仅输出与当前配置相比“需要变化”的参数;不相关或无收益的参数不要输出。 ++ 2) 优先沿“上轮有效”的方向小步前进:连续型参数按原步长的 100%~150% 微增(通常为 +10%~+30%),离散/枚举取更激进且仍在安全范围的相邻档位;避免一次性过大变更(单参数变更幅度不超过 2 倍或 ±30%,取更严格者)。 ++ 3) 不要动已证明对性能“无影响”的参数;避免同时调整明显互斥的参数。 ++ 4) 必须满足依赖/互斥/上限下限/类型与单位要求;数值默认单位为“字节”。若数值后带单位,请以字符串表示(如 "512MB")。 ++ 5) 每个参数的推荐值必须可被系统实际接受并确保应用可启动。 ++ 6) 若无合适变更,输出空json对象。 ++ ++ 输出格式(必须严格遵守): ++ - 仅输出一个 JSON 对象,键为“可调参数名称”,值为“推荐取值”。 ++ - 不要输出任何多余文字、说明、示例、代码围栏或注释。 ++ """ ++ ++ else: ++ prompt = f""" ++ 你是专业的系统运维专家。当前性能指标未达预期,且上一轮调优为负向结果(性能下降/不稳定/报错等)。 ++ 请在“心中完成推理”,只输出最终 JSON;除 JSON 以外不要输出任何文字、代码块或注释。 ++ ++ 目标:基于以下信息,总结历史调优经验中的baseline、最佳调优结果、最差调优结果以及上一轮调优结果以及参数取值,反向微调上轮可能导致退化的参数,并选择更保守且安全的值;仅给出需要变更的参数与推荐新值。 ++ ++ 当前环境配置信息: ++ {self.static_profile} ++ ++ 历史调优信息(包含已修改参数与结果): ++ {history_result} ++ ++ 
调优思路: ++ {optimization_idea} ++ ++ 可调整参数全集(含类型/范围/枚举/默认值等)以及baseline对应的的取值为: ++ {params_set_str} ++ ++ 严格规则(务必遵守): ++ 1) 仅输出与当前配置相比“需要变化”的参数;不相关或无收益的参数不要输出。 ++ 2) 对上轮参与变更且疑似致退化的参数:沿“相反方向”小步调整(幅度为上轮步长的 30%~50%,通常为 -10%~-20%);必要时关闭可选的高开销特性。 ++ 3) 避免一次调整过多参数;不要同时调整互斥参数;优先选择风险更低的修正方案。 ++ 4) 必须满足依赖/互斥/上限下限/类型与单位要求;数值默认单位为“字节”。若数值后带单位,请以字符串表示(如 "1GB")。 ++ 5) 每个参数的推荐值必须可被系统实际接受并确保应用可启动。 ++ 6) 若无合适变更,输出空json对象。 ++ ++ 输出格式(必须严格遵守): ++ - 仅输出一个 JSON 对象,键为“可调参数名称”,值为“推荐取值”。 ++ - 不要输出任何多余文字、说明、示例、代码围栏或注释。 ++ """ ++ ++ response = get_llm_response(prompt) ++ return response ++ +diff --git a/copilot-tune/src/performance_optimizer/set_knob_cmd.jsonl b/copilot-tune/src/performance_optimizer/set_knob_cmd.jsonl +new file mode 100644 +index 0000000..5f5cd50 +--- /dev/null ++++ b/copilot-tune/src/performance_optimizer/set_knob_cmd.jsonl +@@ -0,0 +1,64 @@ ++{"kernel.sched_child_runs_first":"sysctl -w kernel.sched_child_runs_first=0"} ++{"kernel.sched_latency_ns":"sysctl -w kernel.sched_latency_ns=24000000"} ++{"kernel.sched_tunable_scaling":"sysctl -w kernel.sched_tunable_scaling=1"} ++{"kernel.sched_migration_cost_ns":"sysctl -w kernel.sched_migration_cost_ns=500000"} ++{"kernel.sched_nr_migrate":"sysctl -w kernel.sched_nr_migrate=32"} ++{"kernel.sched_min_granularity_ns":"sysctl -w kernel.sched_min_granularity_ns=10000000"} ++{"kernel.sched_wakeup_granularity_ns":"sysctl -w kernel.sched_wakeup_granularity_ns=15000000"} ++{"kernel.shmall":"sysctl -w kernel.shmall=18446744073692774399"} ++{"kernel.shmmax":"sysctl -w kernel.shmmax=18446744073692774399"} ++{"kernel.shmmni":"sysctl -w kernel.shmmni=4096"} ++{"kernel.sched_cfs_bandwidth_slice_us":"sysctl -w kernel.sched_cfs_bandwidth_slice_us=5000"} ++{"kernel.sched_rt_runtime_us":"sysctl -w kernel.sched_rt_runtime_us=950000"} ++{"kernel.sched_autogroup_enabled":"sysctl -w kernel.sched_autogroup_enabled=0"} ++{"vm.dirty_background_ratio":"sysctl -w vm.dirty_background_ratio=10"} ++{"vm.dirty_background_bytes":"sysctl -w vm.dirty_background_bytes=0"} ++{"vm.dirty_ratio":"sysctl -w vm.dirty_ratio=30"} ++{"vm.dirty_bytes":"sysctl -w vm.dirty_bytes=0"} ++{"vm.min_free_kbytes":"sysctl -w vm.min_free_kbytes=90112"} ++{"vm.max_map_count":"sysctl -w vm.max_map_count=65530"} ++{"vm.dirty_expire_centisecs":"sysctl -w vm.dirty_expire_centisecs=3000"} ++{"vm.dirty_writeback_centisecs":"sysctl -w vm.dirty_writeback_centisecs=500"} ++{"vm.swappiness":"sysctl -w vm.swappiness=30"} ++{"vm.vfs_cache_pressure":"sysctl -w vm.vfs_cache_pressure=100"} ++{"vm.watermark_scale_factor":"sysctl -w vm.watermark_scale_factor=10"} ++{"vm.stat_interval":"sysctl -w vm.stat_interval=1"} ++{"vm.overcommit_ratio":"sysctl -w vm.overcommit_ratio=50"} ++{"vm.zone_reclaim_mode":"sysctl -w vm.zone_reclaim_mode=0"} ++{"vm.drop_caches":"sysctl -w vm.drop_caches=0"} ++{"vm.numa_stat":"sysctl -w vm.numa_stat=1"} ++{"net.ipv4.tcp_thin_linear_timeouts":"sysctl -w net.ipv4.tcp_thin_linear_timeouts=0"} ++{"net.core.busy_poll":"sysctl -w net.core.busy_poll=0"} ++{"net.core.netdev_budget":"sysctl -w net.core.netdev_budget=300"} ++{"net.ipv4.ip_local_port_range":"sysctl -w net.ipv4.ip_local_port_range='32768 60999'"} ++{"net.core.somaxconn":"sysctl -w net.core.somaxconn=4096"} ++{"net.core.rps_sock_flow_entries":"sysctl -w net.core.rps_sock_flow_entries=0"} ++{"net.ipv4.tcp_max_tw_buckets":"sysctl -w net.ipv4.tcp_max_tw_buckets=16384"} ++{"net.ipv4.tcp_fin_timeout":"sysctl -w net.ipv4.tcp_fin_timeout=60"} ++{"net.ipv4.tcp_sack":"sysctl -w net.ipv4.tcp_sack=1"} ++{"net.ipv4.tcp_tw_reuse":"sysctl -w 
net.ipv4.tcp_tw_reuse=2"} ++{"net.ipv4.tcp_max_syn_backlog":"sysctl -w net.ipv4.tcp_max_syn_backlog=256"} ++{"net.ipv4.tcp_synack_retries":"sysctl -w net.ipv4.tcp_synack_retries=5"} ++{"mysql.innodb_io_capacity":"sed -i 's/^innodb_io_capacity.*$/innodb_io_capacity=996/g' /etc/my.cnf"} ++{"mysql.innodb_thread_concurrency":"sed -i 's/^innodb_thread_concurrency.*$/innodb_thread_concurrency=238/g' /etc/my.cnf"} ++{"mysql.innodb_lru_scan_depth":"sed -i 's/^innodb_lru_scan_depth.*$/innodb_lru_scan_depth=285/g' /etc/my.cnf"} ++{"mysql.innodb_adaptive_hash_index":"sed -i 's/^innodb_adaptive_hash_index.*$/innodb_adaptive_hash_index=on/g' /etc/my.cnf"} ++{"mysql.innodb_max_dirty_pages_pct":"sed -i 's/^innodb_max_dirty_pages_pct.*$/innodb_max_dirty_pages_pct=90/g' /etc/my.cnf"} ++{"mysql.innodb_buffer_pool_instances":"sed -i 's/^innodb_buffer_pool_instances.*$/innodb_buffer_pool_instances=64/g' /etc/my.cnf"} ++{"mysql.innodb_flush_log_at_trx_commit":"sed -i 's/^innodb_flush_log_at_trx_commit.*$/innodb_flush_log_at_trx_commit=0/g' /etc/my.cnf"} ++{"mysql.innodb_adaptive_max_sleep_delay":"sed -i 's/^innodb_adaptive_max_sleep_delay.*$/innodb_adaptive_max_sleep_delay=150000/g' /etc/my.cnf"} ++{"mysql.innodb_spin_wait_delay":"sed -i 's/^innodb_spin_wait_delay.*$/innodb_spin_wait_delay=7/g' /etc/my.cnf"} ++{"mysql.innodb_log_buffer_size":"sed -i 's/^innodb_log_buffer_size.*$/innodb_log_buffer_size=449536/g' /etc/my.cnf"} ++{"mysql.thread_cache_size":"sed -i 's/^thread_cache_size.*$/thread_cache_size=9/g' /etc/my.cnf"} ++{"mysql.innodb_write_io_threads":"sed -i 's/^innodb_write_io_threads.*$/innodb_write_io_threads=24/g' /etc/my.cnf"} ++{"mysql.innodb_change_buffering":"sed -i 's/^innodb_change_buffering.*$/innodb_change_buffering=all/g' /etc/my.cnf"} ++{"mysql.sync_binlog":"sed -i 's/^sync_binlog.*$/sync_binlog=0/g' /etc/my.cnf"} ++{"mysql.innodb_read_io_threads":"sed -i 's/^innodb_read_io_threads.*$/innodb_read_io_threads=16/g' /etc/my.cnf"} ++{"mysql.innodb_max_dirty_pages_pct_lwm":"sed -i 's/^innodb_max_dirty_pages_pct_lwm.*$/innodb_max_dirty_pages_pct_lwm=10/g' /etc/my.cnf"} ++{"mysql.innodb_log_files_in_group":"sed -i 's/^innodb_log_files_in_group.*$/innodb_log_files_in_group=2/g' /etc/my.cnf"} ++{"mysql.innodb_log_file_size":"sed -i 's/^innodb_log_file_size.*$/innodb_log_file_size=50331648/g' /etc/my.cnf"} ++{"mysql.tmp_table_size":"sed -i 's/^tmp_table_size.*$/tmp_table_size=16777216/g' /etc/my.cnf"} ++{"mysql.innodb_page_cleaners":"sed -i 's/^innodb_page_cleaners.*$/innodb_page_cleaners=1/g' /etc/my.cnf"} ++{"mysql.innodb_adaptive_flushing_lwm":"sed -i 's/^innodb_adaptive_flushing_lwm.*$/innodb_adaptive_flushing_lwm=10/g' /etc/my.cnf"} ++{"mysql.innodb_sync_spin_loops":"sed -i 's/^innodb_sync_spin_loops.*$/innodb_sync_spin_loops=427/g' /etc/my.cnf"} ++{"mysql.innodb_purge_threads":"sed -i 's/^innodb_purge_threads.*$/innodb_purge_threads=32/g' /etc/my.cnf"} +\ No newline at end of file +diff --git a/copilot-tune/src/performance_optimizer/strategy_optimizer.py b/copilot-tune/src/performance_optimizer/strategy_optimizer.py +new file mode 100644 +index 0000000..44e4cdc +--- /dev/null ++++ b/copilot-tune/src/performance_optimizer/strategy_optimizer.py +@@ -0,0 +1,221 @@ ++import json ++import logging ++from typing import List, Dict, Optional, Tuple ++ ++from src.utils.constant import KNOWLEDGE_PATH ++from .base_optimizer import BaseOptimizer ++from src.utils.llm import get_llm_response ++import os ++ ++knowledge_base_path = "./src/knowledge_base/optimize/strategy/system.jsonl" ++if not 
os.path.exists(knowledge_base_path) or not os.path.isfile(knowledge_base_path):
++    knowledge_base_path = os.path.join(KNOWLEDGE_PATH, "optimize/strategy/system.jsonl")
++
++
++class StrategyOptimizer(BaseOptimizer):
++    def __init__(self, **kwargs):
++        """
++        初始化性能优化推荐器
++
++        参数:
++            knowledge_base_path: 策略知识库文件路径(模块级配置)
++        """
++        super().__init__(**kwargs)
++        self.knowledge_base = self._load_knowledge_base(knowledge_base_path)
++
++    def _load_knowledge_base(self, file_path: str) -> List[Dict]:
++        """加载策略知识库文件(兼容 JSON 数组与 JSONL:每行一个 JSON 对象)"""
++        try:
++            with open(file_path, 'r', encoding='utf-8') as f:
++                content = f.read()
++        except FileNotFoundError:
++            raise FileNotFoundError(f"知识库文件 {file_path} 未找到")
++        try:
++            return json.loads(content)
++        except json.JSONDecodeError:
++            try:
++                # system.jsonl 为逐行 JSON 对象,按行解析
++                return [json.loads(line) for line in content.splitlines() if line.strip()]
++            except json.JSONDecodeError:
++                raise ValueError(f"知识库文件 {file_path} 不是有效的JSON或JSONL格式")
++
++    def _filter_strategies(self, bottleneck: str) -> List[Dict]:
++        """
++        根据瓶颈点过滤策略
++
++        参数:
++            bottleneck: 系统性能瓶颈点,可能值为[CPU, DISK, NETWORK, MEMORY, NONE]
++
++        返回:
++            匹配的策略列表
++        """
++        if bottleneck == "NONE":
++            return []
++        return [strategy for strategy in self.knowledge_base
++                if strategy["对应瓶颈点"].startswith(bottleneck)]
++
++    def _generate_llm_prompt(self, strategies: List[Dict], bottleneck: str,
++                             top_k: int = 3,
++                             business_context: Optional[str] = None) -> str:
++        """
++        生成LLM提示词
++
++        参数:
++            strategies: 候选策略列表
++            bottleneck: 系统性能瓶颈点
++            top_k: 期望推荐的策略数量
++            business_context: 当前业务场景描述
++
++        返回:
++            构造好的LLM提示词
++        """
++        strategies_info = "\n\n".join(
++            f"策略 {idx + 1}:\n"
++            f"名称: {s['策略名称']}\n"
++            f"瓶颈点: {s['对应瓶颈点']}\n"
++            f"功能说明: {s['功能说明']}\n"
++            f"使用风险: {s['使用风险']}\n"
++            for idx, s in enumerate(strategies)
++        )
++
++        prompt = (
++            f"当前系统性能瓶颈点为: {bottleneck}\n"
++            f"{'当前业务场景为: ' + business_context if business_context else ''}\n\n"
++            f"以下是候选的优化策略:\n{strategies_info}\n\n"
++            f"请根据以下标准评估并推荐top {top_k}条最优策略:\n"
++            "1. 策略与瓶颈点的匹配程度\n"
++            "2. 策略在当前业务场景下的适用性\n"
++            "3. 策略的风险与收益比\n"
++            "4. 策略的可配置性和易用性\n\n"
++            "请直接返回策略编号(如'策略1,策略3,策略5'),不需要解释原因。"
++        )
++
++        return prompt
++
++    def recommend_strategies(self, bottleneck: str, top_k: int = 3,
++                             business_context: Optional[str] = None) -> List[Dict]:
++        """
++        推荐优化策略
++
++        参数:
++            bottleneck: 系统性能瓶颈点
++            top_k: 返回的推荐策略数量
++            business_context: 当前业务场景描述
++
++        返回:
++            推荐的策略列表(JSON格式)
++        """
++        # 1. 过滤出相关策略
++        candidate_strategies = self._filter_strategies(bottleneck)
++        logging.info(f">>> 过滤大类瓶颈后的策略数量:{len(candidate_strategies)}")
++
++        if not candidate_strategies:
++            return []
++
++        # 2. 生成LLM提示词并获取响应
++        logging.info(f">>> 根据策略功能说明,匹配合适的top {top_k}策略:")
++        prompt = self._generate_llm_prompt(candidate_strategies, bottleneck, top_k, business_context)
++        llm_response = get_llm_response(prompt)
++
++        # 3.
解析LLM响应 ++ try: ++ selected_indices = [] ++ for part in llm_response.split(','): ++ part = part.strip() ++ if part.startswith('策略'): ++ selected_indices.append(int(part[2:]) - 1) ++ elif part.isdigit(): ++ selected_indices.append(int(part) - 1) ++ ++ # 确保索引在有效范围内 ++ selected_indices = [idx for idx in selected_indices ++ if 0 <= idx < len(candidate_strategies)] ++ ++ # 如果LLM返回的推荐不足top_k,补充剩余的策略 ++ if len(selected_indices) < top_k: ++ remaining_indices = [i for i in range(len(candidate_strategies)) ++ if i not in selected_indices] ++ selected_indices.extend(remaining_indices[:top_k - len(selected_indices)]) ++ ++ # 取前top_k个策略 ++ selected_strategies = [candidate_strategies[i] for i in selected_indices[:top_k]] ++ ++ return selected_strategies ++ ++ except Exception as e: ++ logging.info(f"解析LLM响应失败: {e}") ++ # 如果解析失败,返回前top_k个策略 ++ return candidate_strategies[:top_k] ++ ++ def get_recommendations_json(self, bottleneck: str, top_k: int = 3, ++ business_context: Optional[str] = None) -> str: ++ """ ++ 获取JSON格式的推荐策略 ++ ++ 参数: ++ bottleneck: 系统性能瓶颈点 ++ top_k: 返回的推荐策略数量 ++ business_context: 当前业务场景描述 ++ ++ 返回: ++ JSON格式的推荐策略 ++ """ ++ recommendations = self.recommend_strategies(bottleneck, top_k, business_context) ++ return json.dumps(recommendations, ensure_ascii=False, indent=2) ++ ++ def think( ++ self, ++ history: List ++ ) -> Tuple[bool, str]: ++ if history == []: ++ self.args.bottle_neck = "CPU" ++ recommendations = self.recommend_strategies( ++ bottleneck=self.args.bottle_neck, ++ top_k=1, ++ business_context="高并发Web服务,CPU负载主要集中在用户态处理" ++ ) ++ logging.info(f">>> 匹配的策略数量:{len(recommendations)}") ++ cmd_list = [] ++ for strategy in recommendations: ++ logging.info(f">>> - 策略名称:{strategy['策略名称']}") ++ cmd_list.append(strategy['优化步骤']) ++ return False, self.get_bash_script(cmd_list) ++ else: ++ pass ++ ++ def get_bash_script( ++ self, ++ cmd_list: List ++ ) -> str: ++ # 脚本内容的开头部分 ++ script_header = ( ++ "#!/bin/bash\n\n" ++ "echo 'starting setting up strategy...'\n" ++ ) ++ ++ # 将命令列表转换为脚本中的行 ++ commands_str = "\n".join(cmd_list) + "\n" ++ ++ # 脚本内容的结尾部分 ++ script_footer = ( ++ "\necho 'set up strategy done!'\n" ++ ) ++ ++ script_content = script_header + commands_str + script_footer ++ return script_content ++ ++ ++# 示例使用 ++if __name__ == "__main__": ++ # 假设这是本地实现的LLM接口 ++ def get_llm_response(prompt: str) -> str: ++ # 这里应该是实际调用LLM的代码 ++ logging.info("\nLLM提示词:\n", prompt) ++ # 模拟LLM返回 ++ return "策略1" ++ ++ ++ # 初始化优化器 ++ optimizer = StrategyOptimizer("./knowledge_base/optimize/strategy/system.jsonl") ++ ++ # 获取推荐策略 ++ bottleneck = "CPU" # 输入的系统性能瓶颈点 ++ recommendations = optimizer.get_recommendations_json( ++ bottleneck, ++ top_k=1, ++ business_context="高并发Web服务,CPU负载主要集中在用户态处理" ++ ) ++ ++ logging.info("\n推荐策略:\n", recommendations) +diff --git a/copilot-tune/src/performance_test/__init__.py b/copilot-tune/src/performance_test/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/src/performance_test/pressure_test.py b/copilot-tune/src/performance_test/pressure_test.py +new file mode 100644 +index 0000000..d30d898 +--- /dev/null ++++ b/copilot-tune/src/performance_test/pressure_test.py +@@ -0,0 +1,60 @@ ++import time ++import threading ++ ++from src.utils.shell_execute import SshClient ++from src.utils.config.app_config import AppInterface ++from src.utils.common import ExecuteResult ++ ++_pressure_test_running = threading.Event() # 标志压测是否正在运行 ++_pressure_test_result = ExecuteResult( ++ status_code=-1, output=None, err_msg="pressure test not start yet!" 
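++    # status_code 约定:-1 表示压测未开始或已失败,0 表示压测成功(见下方 PressureTest.run)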
++) # 注意除了PressureTest线程能修改外,其他线程不应修改该结果 ++ ++ ++def wait_for_pressure_test(timeout=3600): ++ """ ++ 等待压测线程完成或超时。 ++ :param timeout: 超时时间(秒,默认3600秒) ++ :return: 压测结果或超时提示 ++ """ ++ start_time = time.time() ++ while _pressure_test_running.is_set(): ++ elapsed_time = time.time() - start_time ++ if elapsed_time > timeout: ++ raise TimeoutError(f"[PressureTest] waiting for pressure test timeout.") ++ time.sleep(1) ++ return _pressure_test_result ++ ++ ++class PressureTest(threading.Thread): ++ def __init__( ++ self, ++ app: str, ++ ssh_client: SshClient, ++ ): ++ super().__init__() ++ self.app = app ++ self.app_interface = AppInterface(ssh_client).get(app) ++ self._result = None ++ self.running = True ++ self.daemon = True ++ ++ def get_result(self): ++ return self._result ++ ++ def run(self): ++ global _pressure_test_result ++ global _pressure_test_running ++ try: ++ _pressure_test_running.set() ++ benchmark_result = self.app_interface.benchmark() ++ _pressure_test_result.status_code = 0 ++ _pressure_test_result.output = benchmark_result ++ except Exception as e: ++ _pressure_test_result.status_code = -1 ++ _pressure_test_result.err_msg = ( ++ f"pressure test failed, exception is {str(e)}" ++ ) ++ finally: ++ self.running = False ++ _pressure_test_running.clear() +diff --git a/copilot-tune/src/start_mcpserver.py b/copilot-tune/src/start_mcpserver.py +new file mode 100644 +index 0000000..a12ff57 +--- /dev/null ++++ b/copilot-tune/src/start_mcpserver.py +@@ -0,0 +1,228 @@ ++from mcp.server import FastMCP ++import logging ++from typing import Dict, Any ++ ++from fastapi import HTTPException ++from src.config import config ++from src.performance_analyzer.performance_analyzer import PerformanceAnalyzer ++from src.performance_collector.metric_collector import MetricCollector ++from src.performance_collector.micro_dep_collector import ( ++ MicroDepCollector, ++ COLLECTMODE, ++) ++from src.performance_collector.static_metric_profile_collector import ( ++ StaticMetricProfileCollector, ++) ++from src.performance_optimizer.param_recommender import ParamRecommender ++from src.performance_optimizer.strategy_optimizer import StrategyOptimizer ++from src.utils.config.app_config import AppInterface ++from src.utils.shell_execute import SshClient ++from src.start_tune import run_param_optimization, run_strategy_optimization ++ ++# ================= 全局配置与缓存 =================== ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++cache: Dict[str, Dict[str, Any]] = {} ++ ++# 创建MCP Server ++mcp = FastMCP("性能分析与优化 MCP Server", host="0.0.0.0", port=12147) ++ ++host_ip = config["servers"][0]["ip"] ++host_port = config["servers"][0]["port"] ++host_user = config["servers"][0]["host_user"] ++host_password = config["servers"][0]["password"] ++app_name = config["servers"][0]["app"] ++max_retries = config["servers"][0]["max_retries"] ++delay = config["servers"][0]["delay"] ++ ++ ++# ================= Collector 接口 =================== ++@mcp.tool( ++ name="Collector", ++ description="采集数据" ++) ++def run_collector(): ++ """ ++ 采集机器的性能指标,直接输出,不要发散和删减内容 ++ """ ++ ++ if not host_ip: ++ raise HTTPException( ++ status_code=400, detail=f"请参考部署使用指南,预设待调优机器IP,否则无法采集数据" ++ ) ++ ssh_client = SshClient( ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ max_retries=max_retries, ++ delay=delay, ++ ) ++ ++ # 1. 
静态指标 ++ static_collector = StaticMetricProfileCollector( ++ ssh_client=ssh_client, max_workers=5 ++ ) ++ static_profile = static_collector.run() ++ ++ # 2. 动态指标 ++ metric_collector = MetricCollector( ++ ssh_client=ssh_client, ++ app=app_name, ++ pressure_test_mode=False, ++ ) ++ metrics = metric_collector.run() ++ ++ # 3. 微依赖分析(可选) ++ if config["feature"][0]["microDep_collector"]: ++ micro_collector = MicroDepCollector( ++ ssh_client=ssh_client, ++ iteration=10, ++ target_process_name=config["servers"][0]["target_process_name"], ++ benchmark_cmd=config["benchmark_cmd"], ++ mode=COLLECTMODE.DIRECT_MODE, ++ ) ++ micro_dep = micro_collector.run() ++ metrics["micro_dep"] = micro_dep ++ ++ # 缓存 ++ cache[host_ip] = {"metrics": metrics, "static_profile": static_profile} ++ ++ return { ++ "data": { ++ "static_profile": static_profile, ++ "metrics": metrics, ++ } ++ } ++ ++ ++# ================= Analyzer 接口 =================== ++@mcp.tool( ++ name="Analyzer", ++ description="分析采集到的数据" ++) ++def run_analyzer(): ++ """ ++ 对机器的性能瓶颈进行分析,前提是必须已经进行了数据的采集run_collector,直接输出,不要发散和删减内容 ++ """ ++ if not host_ip or host_ip not in cache or "metrics" not in cache[host_ip]: ++ raise HTTPException( ++ status_code=400, detail=f"{host_ip} 缺少 metrics,请先采集数据,再进行分析" ++ ) ++ ++ analyzer = PerformanceAnalyzer( ++ data=cache[host_ip]["metrics"], app=app_name ++ ) ++ report, bottleneck = analyzer.run() ++ cache[host_ip]["report"] = report ++ cache[host_ip]["bottleneck"] = bottleneck ++ ++ return {"report": report, "bottleneck": bottleneck} ++ ++ ++# ================= Optimizer(参数+策略)接口 =================== ++@mcp.tool( ++ name="Optimizer", ++ description="参数+策略" ++) ++def run_optimizer(): ++ """ ++ 优化机器的性能,推荐相应参数,前提是必须已经进行了数据的分析run_analyzer,直接输出,不要发散和删减内容 ++ """ ++ if ( ++ not host_ip ++ or host_ip not in cache ++ or "report" not in cache[host_ip] ++ or "static_profile" not in cache[host_ip] ++ ): ++ raise HTTPException( ++ status_code=400, ++ detail=f"{host_ip} 缺少 report 或 static_profile,请先执行 /collector 和 /analyzer", ++ ) ++ ++ # --- 参数优化 --- ++ ssh_client = SshClient( ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ max_retries=max_retries, ++ delay=delay, ++ ) ++ ++ param_recommender = ParamRecommender( ++ service_name=app_name, ++ slo_goal=0.1, ++ performance_metric=AppInterface(ssh_client) ++ .get(app_name) ++ .performance_metric, ++ static_profile=cache[host_ip]["static_profile"], ++ performance_analysis_report=cache[host_ip]["report"], ++ ssh_client=ssh_client, ++ ) ++ param_opt_result = param_recommender.run(history_result=None) ++ ++ # --- 策略优化 --- ++ strategy_opt = StrategyOptimizer( ++ application=app_name, ++ bottle_neck=cache[host_ip]["bottleneck"], # fallback ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ system_report=cache[host_ip]["report"], ++ target_config_path="", ++ ) ++ recommendations = strategy_opt.get_recommendations_json( ++ bottleneck=cache[host_ip]["bottleneck"], ++ top_k=1, ++ business_context="高并发Web服务,CPU负载主要集中在用户态处理", ++ ) ++ ++ return { ++ "param_optimization": param_opt_result, ++ "strategy_recommendation": recommendations, ++ } ++ ++ ++@mcp.tool( ++ name="StartTune", ++ description="开始调优" ++) ++def tune(): ++ """ ++ 此工具用于开始调优,只有用户明确需要开始调优才调用; ++ 此工具耗时预计1小时,需要提醒用户注意等待执行结束; ++ 结果在日志中查看 ,journalctl -xe -u tune-mcpserver --all -f ++ """ ++ feature_cfg = config["feature"][0] ++ report = cache[host_ip]["report"] ++ bottleneck = cache[host_ip]["bottleneck"] ++ server_cfg = 
config["servers"][0] ++ static_profile_info = cache[host_ip]["static_profile"] ++ ssh_client = SshClient( ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ max_retries=max_retries, ++ delay=delay, ++ ) ++ run_param_optimization( ++ server_cfg["app"], report, static_profile_info, ssh_client, ++ feature_cfg["need_restart_application"], feature_cfg["pressure_test_mode"], ++ feature_cfg["tune_system_param"], feature_cfg["tune_app_param"], feature_cfg["need_recover_cluster"], ++ feature_cfg["benchmark_timeout"] ++ ) ++ if feature_cfg["strategy_optimization"]: ++ run_strategy_optimization(ssh_client, server_cfg["app"], bottleneck, server_cfg, report) ++ return "调优执行完成" ++ ++ ++def main(): ++ mcp.run(transport='sse') ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/copilot-tune/src/start_tune.py b/copilot-tune/src/start_tune.py +new file mode 100644 +index 0000000..623cbf7 +--- /dev/null ++++ b/copilot-tune/src/start_tune.py +@@ -0,0 +1,167 @@ ++import logging ++ ++from src.performance_collector.micro_dep_collector import MicroDepCollector, COLLECTMODE ++from src.config import config ++from src.performance_analyzer.performance_analyzer import PerformanceAnalyzer ++from src.performance_collector.metric_collector import MetricCollector ++from src.performance_collector.static_metric_profile_collector import StaticMetricProfileCollector ++from src.performance_optimizer.param_optimizer import ParamOptimizer ++from src.performance_optimizer.strategy_optimizer import StrategyOptimizer ++from src.performance_test.pressure_test import PressureTest ++from src.utils.collector.collector_trigger import TriggerEventListener ++from src.utils.common import display_metrics ++from src.utils.shell_execute import SshClient ++ ++ ++def setup_logging(): ++ """配置日志格式与级别""" ++ logging.basicConfig( ++ level=logging.INFO, ++ format="%(asctime)s - %(levelname)s - %(message)s", ++ datefmt="%Y-%m-%d %H:%M:%S", ++ ) ++ ++ ++def create_ssh_client(server_cfg): ++ """根据配置创建 SSH 客户端""" ++ return SshClient( ++ host_ip=server_cfg["ip"], ++ host_port=server_cfg["port"], ++ host_user=server_cfg["host_user"], ++ host_password=server_cfg["password"], ++ max_retries=server_cfg["max_retries"], ++ delay=server_cfg["delay"], ++ ) ++ ++ ++def collect_static_metrics(ssh_client): ++ """采集静态指标""" ++ static_collector = StaticMetricProfileCollector(ssh_client=ssh_client, max_workers=5) ++ static_profile_info = static_collector.run() ++ display_metrics(static_profile_info["static"], headers=["指标名称", "指标值"]) ++ return static_profile_info ++ ++ ++def run_pressure_test_if_needed(server_cfg, ssh_client, enabled): ++ """如果启用压测模式,则执行压测准备和触发器""" ++ if not enabled: ++ return ++ logging.info("[Main] start pressure test ...") ++ pressure_test = PressureTest(server_cfg["app"], ssh_client) ++ listener = TriggerEventListener().configure( ++ host=server_cfg["ip"], ++ port=server_cfg["port"], ++ user=server_cfg["host_user"], ++ password=server_cfg["password"], ++ ) ++ listener.run() ++ pressure_test.start() ++ ++ ++def collect_runtime_metrics(ssh_client, server_cfg, pressure_test_mode): ++ """采集运行时性能指标""" ++ metric_collector = MetricCollector( ++ ssh_client=ssh_client, ++ app=server_cfg["app"], ++ pressure_test_mode=pressure_test_mode, ++ ) ++ data = metric_collector.run() ++ display_metrics(data, headers=["负载类型", "指标名", "指标值"]) ++ return data ++ ++ ++def collect_micro_dependencies_if_needed(ssh_client, data, server_cfg, need_micro_dep): ++ """如果需要,采集微依赖信息""" ++ if not need_micro_dep: ++ return data 
++    micro_dep_collector = MicroDepCollector(
++        ssh_client=ssh_client,
++        iteration=10,
++        target_process_name=server_cfg["target_process_name"],
++        benchmark_cmd=config["benchmark_cmd"],
++        mode=COLLECTMODE.DIRECT_MODE,
++    )
++    micro_dep_data = micro_dep_collector.run()
++    logging.info(f"MicroDepCollector data: {micro_dep_data}")
++    data["micro_dep"] = micro_dep_data
++    return data
++
++
++def analyze_performance(data, app):
++    """分析性能瓶颈"""
++    logging.info("[Main] analyzing performance ...")
++    analyzer = PerformanceAnalyzer(data=data, app=app)
++    return analyzer.run()
++
++
++def run_param_optimization(app, report, static_profile_info, ssh_client, need_restart, pressure_mode, tune_system_param,
++                           tune_app_param, need_recover_cluster, benchmark_timeout):
++    """执行参数优化"""
++
++    def slo_calc_callback(baseline, benchmark_result, symbol):
++        if baseline is None or abs(baseline) < 1e-9:
++            return 0.0
++        return symbol * (benchmark_result - baseline) / baseline
++
++    optimizer = ParamOptimizer(
++        service_name=app,
++        slo_goal=0.1,
++        analysis_report=report,
++        static_profile=static_profile_info,
++        ssh_client=ssh_client,
++        slo_calc_callback=slo_calc_callback,
++        max_iterations=20,
++        need_restart_application=need_restart,
++        pressure_test_mode=pressure_mode,
++        tune_system_param=tune_system_param,
++        tune_app_param=tune_app_param,
++        need_recover_cluster=need_recover_cluster,
++        benchmark_timeout=benchmark_timeout
++    )
++    optimizer.run()
++
++
++def run_strategy_optimization(ssh_client, app, bottleneck, server_cfg, report):
++    """执行策略优化并输出推荐"""
++    strategy_optimizer = StrategyOptimizer(
++        application=app,
++        bottle_neck=bottleneck,
++        ssh_client=ssh_client,
++        system_report=report,
++        target_config_path="",
++    )
++    recommendations = strategy_optimizer.get_recommendations_json(
++        bottleneck, top_k=1, business_context=server_cfg["business_context"]
++    )
++    logging.info("推荐策略:%s", recommendations)
++
++
++def main():
++    setup_logging()
++    server_cfg = config["servers"][0]
++    feature_cfg = config["feature"][0]
++
++    ssh_client = create_ssh_client(server_cfg)
++
++    static_profile_info = collect_static_metrics(ssh_client)
++    run_pressure_test_if_needed(server_cfg, ssh_client, feature_cfg["pressure_test_mode"])
++
++    metrics_data = collect_runtime_metrics(ssh_client, server_cfg, feature_cfg["pressure_test_mode"])
++    metrics_data = collect_micro_dependencies_if_needed(ssh_client, metrics_data, server_cfg,
++                                                        feature_cfg["microDep_collector"])
++
++    report, bottleneck = analyze_performance(metrics_data, server_cfg["app"])
++    logging.info(f">>> PerformanceAnalyzer运行结果:{report} {bottleneck}")
++
++    run_param_optimization(
++        server_cfg["app"], report, static_profile_info, ssh_client,
++        feature_cfg["need_restart_application"], feature_cfg["pressure_test_mode"],
++        feature_cfg["tune_system_param"], feature_cfg["tune_app_param"], feature_cfg["need_recover_cluster"],
++        feature_cfg["benchmark_timeout"]
++    )
++    if feature_cfg["strategy_optimization"]:
++        run_strategy_optimization(ssh_client, server_cfg["app"], bottleneck, server_cfg, report)
++
++
++if __name__ == "__main__":
++    main()
+diff --git a/copilot-tune/src/start_workflow.py b/copilot-tune/src/start_workflow.py
+new file mode 100644
+index 0000000..f53f95b
+--- /dev/null
++++ b/copilot-tune/src/start_workflow.py
+@@ -0,0 +1,193 @@
++import logging
++from typing import Dict, Any
++
++from fastapi import FastAPI, HTTPException
++
++from src.config import config
++from src.performance_analyzer.performance_analyzer import PerformanceAnalyzer
++from
src.performance_collector.metric_collector import MetricCollector ++from src.performance_collector.micro_dep_collector import ( ++ MicroDepCollector, ++ COLLECTMODE, ++) ++from src.performance_collector.static_metric_profile_collector import ( ++ StaticMetricProfileCollector, ++) ++from src.performance_optimizer.param_recommender import ParamRecommender ++from src.performance_optimizer.strategy_optimizer import StrategyOptimizer ++from src.utils.config.app_config import AppInterface ++from src.utils.shell_execute import SshClient ++from src.start_tune import main as start_tune ++ ++# ================= FastAPI 初始化 =================== ++app = FastAPI( ++ title="性能分析与优化 API", ++ description="统一接口:Collector / Analyzer / Optimizer", ++ version="1.0.0", ++) ++ ++# ================= 全局配置与缓存 =================== ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++cache: Dict[str, Dict[str, Any]] = {} ++ ++host_ip = config["servers"][0]["ip"] ++host_port = config["servers"][0]["port"] ++host_user = config["servers"][0]["host_user"] ++host_password = config["servers"][0]["password"] ++app_name = config["servers"][0]["app"] ++max_retries = config["servers"][0]["max_retries"] ++delay = config["servers"][0]["delay"] ++ ++ ++# ================= Collector 接口 =================== ++@app.get("/collector") ++def run_collector(): ++ if not host_ip: ++ raise HTTPException( ++ status_code=400, detail=f"需要输入待调优机器IP,否则无法采集数据" ++ ) ++ ssh_client = SshClient( ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ max_retries=max_retries, ++ delay=delay, ++ ) ++ ++ # 1. 静态指标 ++ static_collector = StaticMetricProfileCollector( ++ ssh_client=ssh_client, max_workers=5 ++ ) ++ static_profile = static_collector.run() ++ ++ # 2. 动态指标 ++ metric_collector = MetricCollector( ++ ssh_client=ssh_client, ++ app=app_name, ++ pressure_test_mode=False, ++ ) ++ metrics = metric_collector.run() ++ ++ # 3. 
微依赖分析(可选) ++ if config["feature"][0]["microDep_collector"]: ++ micro_collector = MicroDepCollector( ++ ssh_client=ssh_client, ++ iteration=10, ++ target_process_name=config["servers"][0]["target_process_name"], ++ benchmark_cmd=config["benchmark_cmd"], ++ mode=COLLECTMODE.DIRECT_MODE, ++ ) ++ micro_dep = micro_collector.run() ++ metrics["micro_dep"] = micro_dep ++ ++ # 缓存 ++ cache[host_ip] = {"metrics": metrics, "static_profile": static_profile} ++ ++ return { ++ "data": { ++ "static_profile": static_profile, ++ "metrics": metrics, ++ } ++ } ++ ++ ++# ================= Analyzer 接口 =================== ++@app.get("/analyzer") ++def run_analyzer(): ++ if not host_ip or host_ip not in cache or "metrics" not in cache[host_ip]: ++ raise HTTPException( ++ status_code=400, detail=f"{host_ip} 缺少 metrics,请先采集数据,再进行分析" ++ ) ++ ++ analyzer = PerformanceAnalyzer( ++ data=cache[host_ip]["metrics"], app=app_name ++ ) ++ report, bottleneck = analyzer.run() ++ cache[host_ip]["report"] = report ++ cache[host_ip]["bottleneck"] = bottleneck ++ ++ return {"report": report, "bottleneck": bottleneck} ++ ++ ++# ================= Optimizer(参数+策略)接口 =================== ++@app.get("/optimizer") ++def run_optimizer(): ++ if ( ++ not host_ip ++ or host_ip not in cache ++ or "report" not in cache[host_ip] ++ or "static_profile" not in cache[host_ip] ++ ): ++ raise HTTPException( ++ status_code=400, ++ detail=f"{host_ip} 缺少 report 或 static_profile,请先执行 /collector 和 /analyzer", ++ ) ++ ++ # --- 参数优化 --- ++ ssh_client = SshClient( ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ max_retries=max_retries, ++ delay=delay, ++ ) ++ ++ param_recommender = ParamRecommender( ++ service_name=app_name, ++ slo_goal=0.1, ++ performance_metric=AppInterface(ssh_client) ++ .get(app_name) ++ .performance_metric, ++ static_profile=cache[host_ip]["static_profile"], ++ performance_analysis_report=cache[host_ip]["report"], ++ ssh_client=ssh_client, ++ ) ++ param_opt_result = param_recommender.run(history_result=None) ++ ++ # --- 策略优化 --- ++ strategy_opt = StrategyOptimizer( ++ application=app_name, ++ bottle_neck=cache[host_ip]["bottleneck"], # fallback ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ system_report=cache[host_ip]["report"], ++ target_config_path="", ++ ) ++ recommendations = strategy_opt.get_recommendations_json( ++ bottleneck=cache[host_ip]["bottleneck"], ++ top_k=1, ++ business_context="高并发Web服务,CPU负载主要集中在用户态处理", ++ ) ++ ++ return { ++ "param_optimization": param_opt_result, ++ "strategy_recommendation": recommendations, ++ } ++ ++ ++# ================= tune(开始调优)接口 =================== ++@app.get("/start_tune") ++def tune(): ++ """ ++ 此工具用于开始调优,只有用户明确需要开始调优才调用; ++ 此工具耗时预计1小时,需要提醒用户注意等待执行结束; ++ 结果在日志中查看 ,journalctl -xe -u tune-mcpserver --all -f ++ """ ++ start_tune() ++ return "调优执行完成" ++ ++ ++def main(): ++ import uvicorn ++ ++ uvicorn.run(app=app, host="0.0.0.0", port=8092) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/copilot-tune/src/tests/manager/task_manager.py b/copilot-tune/src/tests/manager/task_manager.py +new file mode 100644 +index 0000000..679680b +--- /dev/null ++++ b/copilot-tune/src/tests/manager/task_manager.py +@@ -0,0 +1,47 @@ ++import logging ++import pyfiglet ++from tabulate import tabulate ++ ++from src.tests.mock_ssh_client import SshClient ++from src.utils.manager.task_manager import TaskManager ++from src.utils.shell_execute import SshClient ++from src.config import config ++from 
src.performance_collector.application import pgsql_collector ++from src.performance_analyzer.application.pgsql_analyzer import PgsqlAnalyzer ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++host_ip = config["servers"][0]["ip"] ++host_port = config["servers"][0]["port"] ++host_user = config["servers"][0]["host_user"] ++host_password = config["servers"][0]["password"] ++app = config["servers"][0]["app"] ++max_retries = config["servers"][0]["max_retries"] ++delay = config["servers"][0]["delay"] ++target_process_name = config["servers"][0]["target_process_name"] ++benchmark_cmd = config["benchmark_cmd"] ++need_restart_application = config["feature"][0]["need_restart_application"] ++need_microDep_collector = config["feature"][0]["microDep_collector"] ++ ++ssh_client = SshClient( ++ host_ip=host_ip, ++ host_port=host_port, ++ host_user=host_user, ++ host_password=host_password, ++ max_retries=max_retries, ++ delay=delay, ++) ++ ++task_manager = TaskManager( ++ ssh_client=ssh_client, ++ modules=[pgsql_collector], ++ global_trigger_mode=False, ++ timeout=60, ++ debug=True ++) ++ ++result = task_manager.run() ++ ++print(result) +diff --git a/copilot-tune/src/tests/manager/test_trigger_signal.py b/copilot-tune/src/tests/manager/test_trigger_signal.py +new file mode 100644 +index 0000000..26c2573 +--- /dev/null ++++ b/copilot-tune/src/tests/manager/test_trigger_signal.py +@@ -0,0 +1,34 @@ ++import threading ++from src.utils.collector.collector_trigger import TriggerEventListener ++ ++ ++def wait_thread(name, listener): ++ print(f"[{name}] 开始等待信号...") ++ status = listener.wait() ++ print(f"[{name}] 结束等待,状态为:{status.name}") ++ ++ ++def main(): ++ listener = TriggerEventListener(timeout=2) # 设置较长超时,等你手动写信号 ++ listener.run() ++ ++ import time ++ time.sleep(3) ++ # 启动两个等待线程 ++ threads = [] ++ for i in range(2): ++ t = threading.Thread(target=wait_thread, args=(f"Worker-{i+1}", listener)) ++ t.start() ++ threads.append(t) ++ ++ # 不自动触发信号,注释掉模拟线程 ++ # threading.Thread(target=simulate_fifo_signal, daemon=True).start() ++ ++ for t in threads: ++ t.join() ++ ++ print("主线程最终状态:", listener.get_status().name) ++ ++ ++if __name__ == "__main__": ++ main() +diff --git a/copilot-tune/src/tests/mock_ssh_client.py b/copilot-tune/src/tests/mock_ssh_client.py +new file mode 100644 +index 0000000..eae95b3 +--- /dev/null ++++ b/copilot-tune/src/tests/mock_ssh_client.py +@@ -0,0 +1,17 @@ ++class Result: ++ def __init__(self, status_code, output, cmd): ++ self.status_code = status_code ++ self.output = output ++ self.cmd = cmd ++ ++ ++class SshClient: ++ def __init__(self): ++ self.host_ip = "127.0.0.1" ++ self.host_port = 22 ++ ++ def run_cmd(self, cmd): ++ return Result(0, "12", cmd) ++ ++ def run_local_cmd(self, cmd): ++ return Result(0, "12", cmd) +diff --git a/copilot-tune/src/tests/test_perf_optim/param_knowledge.py b/copilot-tune/src/tests/test_perf_optim/param_knowledge.py +new file mode 100644 +index 0000000..4595807 +--- /dev/null ++++ b/copilot-tune/src/tests/test_perf_optim/param_knowledge.py +@@ -0,0 +1,10 @@ ++from src.performance_optimizer.param_knowledge import ParamKnowledge ++from src.tests.mock_ssh_client import SshClient ++ ++ ++ssh_client = SshClient() ++param_knowledge = ParamKnowledge(ssh_client) ++res = param_knowledge.describe_param_background_knob( ++ "mysql", ["innodb_adaptive_flushing"] ++) ++print(res) +diff --git a/copilot-tune/src/tests/test_utils/collector/collector_trigger.py 
b/copilot-tune/src/tests/test_utils/collector/collector_trigger.py +new file mode 100644 +index 0000000..72f79fc +--- /dev/null ++++ b/copilot-tune/src/tests/test_utils/collector/collector_trigger.py +@@ -0,0 +1,30 @@ ++from src.utils.collector.collector_trigger import fifo_signal_monitor, no_signal_monitor, TriggerEventListener ++ ++ ++trigger_event_listener = TriggerEventListener() ++def test_fifo_signal_monitor(): ++ print("开始测试...") ++ with fifo_signal_monitor(timeout=5) as signal_received: ++ if signal_received: ++ print("在超时时间内接收到信号,继续执行...") ++ else: ++ print("超时未接收到信号,终止执行。") ++ ++ ++def test_no_fifo_signal_monitor(): ++ print("开始测试...") ++ with no_signal_monitor(timeout=30) as signal_received: ++ if signal_received: ++ print("在超时时间内接收到信号,继续执行...") ++ else: ++ print("超时未接收到信号,终止执行。") ++ ++ ++# test_fifo_signal_monitor() ++# test_no_fifo_signal_monitor() ++ ++trigger_event_listener.run() ++ ++trigger_event_listener.wait() ++ ++print("triggered signal") +diff --git a/copilot-tune/src/tests/test_utils/collector/metric_collector.py b/copilot-tune/src/tests/test_utils/collector/metric_collector.py +new file mode 100644 +index 0000000..25b2406 +--- /dev/null ++++ b/copilot-tune/src/tests/test_utils/collector/metric_collector.py +@@ -0,0 +1,66 @@ ++import random ++from src.tests.test_utils.collector import test_cpu_collector ++from src.utils.thread_pool import thread_pool_manager, serial_task_manager ++from src.utils.collector.metric_collector import ( ++ period_task, ++ snapshot_task, ++ CollectMode, ++ CollectType, ++ get_registered_module_tasks, ++) ++ ++ ++class Result: ++ def __init__(self, status_code, output, cmd): ++ self.status_code = status_code ++ self.output = output ++ self.cmd = cmd ++ ++ def __repr__(self): ++ return self.output ++ ++ ++class SshClient: ++ def __init__(self): ++ self.host_ip = "127.0.0.1" ++ self.host_port = 22 ++ ++ def run_cmd(self, cmd): ++ return Result(0, str(random.uniform(0.6, 0.8)), cmd) ++ ++ ++# 声明ssh client连接 ++ssh_client = SshClient() ++# 获取test_cpu_collector通过修饰器注册的异步任务 ++async_tasks = get_registered_module_tasks(test_cpu_collector, CollectMode.ASYNC) ++# 获取test_cpu_collector通过修饰器注册的同步任务 ++sync_tasks = get_registered_module_tasks(test_cpu_collector, CollectMode.SYNC) ++ ++# 异步任务入线程池 ++thread_pool_manager.add_batch( ++ [(func_info["func"], (ssh_client,)) for func_info in async_tasks] ++) ++ ++# 同步任务进入串行任务池 ++serial_task_manager.add_batch( ++ [(func_info["func"], (ssh_client,)) for func_info in sync_tasks] ++) ++ ++ ++# 异步任务利用run_all_task()接口执行任务 ++thread_pool_manager.run_all_tasks() ++# 异步任务利用get_all_results()接口阻塞程序,等待所有线程都ready ++# 注意每次执行完一批任务后,下次执行会清空所有上一步的任务和结果,需要重新add任务 ++task_results = thread_pool_manager.get_all_results() ++ ++print("*" * 30 + "async results" + "*" * 30) ++print(task_results) ++print("*" * 30 + "async results" + "*" * 30) ++ ++ ++# 同步任务同理 ++serial_task_manager.run_all_tasks() ++task_results = serial_task_manager.get_all_results() ++print("*" * 30 + "sync results" + "*" * 30) ++print(task_results) ++print("*" * 30 + "sync results" + "*" * 30) +diff --git a/copilot-tune/src/tests/test_utils/collector/test_cpu_collector.py b/copilot-tune/src/tests/test_utils/collector/test_cpu_collector.py +new file mode 100644 +index 0000000..eed1864 +--- /dev/null ++++ b/copilot-tune/src/tests/test_utils/collector/test_cpu_collector.py +@@ -0,0 +1,50 @@ ++from src.utils.collector.metric_collector import ( ++ period_task, ++ snapshot_task, ++ CollectMode, ++ CollectType, ++) ++ ++ ++@period_task( ++ cmd="top", ++ delay=2, ++ sample_count=10, 
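++    # 采样计划:delay=2 表示先等 2s 让业务趋于平稳,随后按 interval 的间隔
++    # 连续采样 sample_count 次(各参数含义见 period_task 的 docstring)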
++ interval=1, ++ collect_mode=CollectMode.ASYNC, ++ collect_type=CollectType.DIRECT, ++) ++def cpu_usage_parser(output_list: list[str]): ++ total_cpu_usage = 0.0 ++ for output in output_list: ++ if not output: ++ continue ++ total_cpu_usage += float(output) ++ return {"avg_cpu_usage": total_cpu_usage / len(output_list)} ++ ++ ++@snapshot_task( ++ cmd="aaa", collect_mode=CollectMode.ASYNC, collect_type=CollectType.TRIGGERED ++) ++def numa_parser(output: str): ++ return {"numa_number": output} ++ ++ ++@snapshot_task( ++ cmd="test", collect_mode=CollectMode.SYNC, collect_type=CollectType.DIRECT ++) ++def memory_usage_parser(output: str): ++ import time ++ ++ time.sleep(5) ++ return {"memory_usage": output} ++ ++ ++@snapshot_task( ++ cmd="test", collect_mode=CollectMode.SYNC, collect_type=CollectType.DIRECT ++) ++def disk_usage_parser(output: str): ++ import time ++ ++ time.sleep(2) ++ return {"disk_usage": output} +diff --git a/copilot-tune/src/tests/test_utils/collector/test_io_collector.py b/copilot-tune/src/tests/test_utils/collector/test_io_collector.py +new file mode 100644 +index 0000000..c2b178c +--- /dev/null ++++ b/copilot-tune/src/tests/test_utils/collector/test_io_collector.py +@@ -0,0 +1,50 @@ ++from src.utils.collector.metric_collector import ( ++ period_task, ++ snapshot_task, ++ CollectMode, ++ CollectType, ++) ++ ++ ++@period_task( ++ cmd="top", ++ delay=2, ++ sample_count=10, ++ interval=1, ++ collect_mode=CollectMode.ASYNC, ++ collect_type=CollectType.DIRECT, ++) ++def hbm_usage_parser(output_list: list[str]): ++ total_cpu_usage = 0.0 ++ for output in output_list: ++ if not output: ++ continue ++ total_cpu_usage += float(output) ++ return {"avg_cpu_usage": total_cpu_usage / len(output_list)} ++ ++ ++@snapshot_task( ++ cmd="aaa", collect_mode=CollectMode.ASYNC, collect_type=CollectType.TRIGGERED ++) ++def eth_parser(output: str): ++ return {"numa_number": output} ++ ++ ++@snapshot_task( ++ cmd="test", collect_mode=CollectMode.SYNC, collect_type=CollectType.DIRECT ++) ++def fio_usage_parser(output: str): ++ import time ++ ++ time.sleep(5) ++ return {"memory_usage": output} ++ ++ ++@snapshot_task( ++ cmd="test", collect_mode=CollectMode.SYNC, collect_type=CollectType.DIRECT ++) ++def system_usage_parser(output: str): ++ import time ++ ++ time.sleep(40) ++ return {"disk_usage": output} +diff --git a/copilot-tune/src/tests/test_utils/config/app_config.py b/copilot-tune/src/tests/test_utils/config/app_config.py +new file mode 100644 +index 0000000..7c4819a +--- /dev/null ++++ b/copilot-tune/src/tests/test_utils/config/app_config.py +@@ -0,0 +1,23 @@ ++from src.utils.config.app_config import AppInterface ++from src.tests.mock_ssh_client import SshClient ++ ++ssh_client = SshClient() ++app_interface = AppInterface(ssh_client) ++mysql = app_interface.mysql ++sys = app_interface.system ++ ++res = mysql.get_param("hello") ++print(res.cmd) ++res = mysql.set_param("hello", 1) ++print(res.cmd) ++res = mysql.start_workload() ++print(res.cmd) ++res = mysql.stop_workload() ++print(res.cmd) ++res = mysql.benchmark() ++print(res.cmd) ++ ++res = sys.get_param("aaa") ++print(res.cmd) ++res = sys.set_param("bbb", 2) ++print(res.cmd) +diff --git a/copilot-tune/src/tests/test_utils/test_pressure_test.py b/copilot-tune/src/tests/test_utils/test_pressure_test.py +new file mode 100644 +index 0000000..4012dd2 +--- /dev/null ++++ b/copilot-tune/src/tests/test_utils/test_pressure_test.py +@@ -0,0 +1,43 @@ ++# start_workflow.py ++from src.performance_test.pressure_test import PressureTest, 
wait_for_pressure_test
++import time
++
++
++class SshClient:
++    def __init__(self):
++        self.host_ip = "127.0.0.1"
++        self.host_port = 22
++
++    def run_cmd(self, cmd):
++        print(cmd)
++        time.sleep(5)
++        return 123.4
++
++    def run_local_cmd(self, cmd):
++        print(cmd)
++        time.sleep(5)
++        return 333.1
++
++
++if __name__ == "__main__":
++    # 初始化SSH客户端
++    ssh_client = SshClient()
++
++    # 创建压测线程
++    app = "mysql"
++    thread = PressureTest(app, ssh_client)
++    thread.start()
++
++    # 等待压测完成或超时
++    result = wait_for_pressure_test(timeout=300)
++
++    # 打印结果
++    if isinstance(result, str):
++        print(result)
++    else:
++        if result.status_code == 0:
++            print("压测成功,结果如下:")
++            print(result.output)
++        else:
++            print("压测失败,错误信息如下:")
++            print(result.err_msg)
+diff --git a/copilot-tune/src/tests/test_utils/thread_pool.py b/copilot-tune/src/tests/test_utils/thread_pool.py
+new file mode 100644
+index 0000000..1d27183
+--- /dev/null
++++ b/copilot-tune/src/tests/test_utils/thread_pool.py
+@@ -0,0 +1,29 @@
++import time
++from src.utils.thread_pool import SerialTaskManager
++
++
++def task1():
++    time.sleep(10)
++    return "Task 1 completed"
++
++
++def task2():
++    time.sleep(40)
++    return "Task 2 completed"
++
++
++def task3(x, y):
++    return x + y
++
++start_time = time.time()
++manager = SerialTaskManager()
++manager.add_task(task1)
++manager.add_task(task2)
++manager.add_task(task3, 5, 7)
++
++results = manager.get_all_results()
++for result in results:
++    print(result)
++
++end_time = time.time()
++print(f"cost time {end_time - start_time} s")
+diff --git a/copilot-tune/src/utils/README.md b/copilot-tune/src/utils/README.md
+new file mode 100644
+index 0000000..69b91ee
+--- /dev/null
++++ b/copilot-tune/src/utils/README.md
+@@ -0,0 +1,169 @@
++# 采集框架开发指南
++
++由于采集数据模块需要频繁调用shell命令,因此新增数据采集框架用于更加便捷地扩展新增采集指标。采集框架代码在src/utils目录下,下面是针对utils目录各代码文件的功能描述。
++
++```shell
++├── __init__.py
++├── common.py 公共数据结构
++├── json_repair.py 从大模型回复中获取json
++├── llm.py 大模型统一接口调用模块
++├── metrics.py 指标定义和含义
++├── rag
++│   ├── __init__.py
++│   └── knob_rag.py rag检索,推荐参数模块
++├── shell_execute.py 远程执行命令模块
++└── thread_pool.py 线程池,用于批量并行执行任务
++```
++
++
++
++## 1.采集模块接口介绍
++
++### SshClient接口
++
++类定义如下:
++
++```python
++class SshClient:
++    def __init__(
++        self,
++        host_ip: str = "",
++        host_port: int = 22,
++        host_user: str = "root",
++        host_password: str = "",
++        max_retries: int = 0,
++        delay: float = 1.0,
++    ):
++        self.host_ip = host_ip
++        self.host_port = host_port
++        self.host_user = host_user
++        self.host_password = host_password
++
++        self.max_retries = max_retries
++        self.delay = delay
++```
++
++| 参数名        | 参数类型 | 参数取值范围 | 参数含义说明                                       |
++| ------------- | -------- | ------------ | -------------------------------------------------- |
++| host_ip       | 字符串   | -            | 远程连接ssh客户端的ip                              |
++| host_port     | 整型     | 0~65535      | ssh连接的端口号,一般默认是22                      |
++| host_user     | 字符串   | -            | ssh连接的用户名                                    |
++| host_password | 字符串   | -            | ssh连接的密码                                      |
++| max_retries   | 整型     | 0~10         | 重试次数,当远程连接失败或者命令执行失败时进行重试 |
++| delay         | 浮点型   | -            | 每次失败等待多久后重试                             |
++
++
++
++执行cmd命令接口如下:
++
++```python
++def run_cmd(self, cmd: str) -> ExecuteResult:
++```
++
++| 参数名 | 参数类型 | 参数取值范围 | 参数含义说明            |
++| ------ | -------- | ------------ | ----------------------- |
++| cmd    | 字符串   | -            | 待执行的shell命令字符串 |
++
++
++
++返回值为ExecuteResult类型,定义如下:
++
++```python
++class ExecuteResult:
++    def __init__(self, status_code: int = -1, output: Any = None, err_msg: str = ""):
++        self.status_code = status_code
++        self.output = output
++        self.err_msg = err_msg
++```
++
++
++
++| 参数名      | 参数类型 | 参数取值范围 | 参数含义说明          |
++| ----------- | -------- | ------------ | --------------------- |
++| status_code | 整型     | -255~255     | shell命令执行结果状态 |
++| output      | 任意类型 | -            | 输出结果              |
++| err_msg     | 字符串   | -            | 错误信息              |
++
++
++
++### cmd_pipeline接口
++
++cmd_pipeline接口定义如下:
++
++```python
++def cmd_pipeline(
++    cmd: str = "",
++    tag: str = "default_tag",
++    parallel: bool = False,
++) -> ExecuteResult:
++```
++
++| 参数名   | 参数类型 | 参数取值范围 | 参数含义说明                               |
++| -------- | -------- | ------------ | ------------------------------------------ |
++| cmd      | 字符串   | -            | 待执行shell命令字符串                      |
++| tag      | 字符串   | -            | 任务标签,用于区分任务类型,用户可自行定义 |
++| parallel | 布尔类型 | -            | 是否并行执行                               |
++
++**cmd_pipeline**是一个修饰器,用于修饰命令解析函数,使用该修饰器可获得两个功能:
++
+++ 自动执行shell命令,将shell命令结果返回给被修饰函数,在扩展数据采集接口时只需要关注如何解析输出结果即可
+++ 自动注册任务,通过模块名即可获取注册的任务,便于批量提交任务到线程池执行
++
++首先开发者需要开发一个解析命令行结果的函数,需要满足如下定义。入参output即为修饰器中cmd指定的shell命令的输出结果,输出类型不限定,一般设置为dict类型:
++
++```python
++@cmd_pipeline(cmd="ulimit -n", tag="static", parallel=True)
++def fdlimit_parser(output: str) -> dict:
++    """解析 ulimit -n 输出:文件描述符上限"""
++    return {"最大文件描述符": int(output.strip())}
++```
++
++被cmd_pipeline修饰的函数会被封装成一个新的函数,入参为SshClient类型,输出为ExecuteResult类型,调用方法如下:
++
++```python
++result = fdlimit_parser(ssh_client)
++```
++
++这样,函数原本的输出就会被封装到ExecuteResult.output中,打印result可以看到如下结果:
++
++```shell
++{'status_code': 0, 'err_msg': '', 'output': {'最大文件描述符': 1024}}
++```
++
++这样一个数据采集接口就开发完成了。
++
++
++
++当我们在某个模块定义好了一系列的数据采集接口后,可以通过模块名获取这些接口。例如:在src.performance_collector.static_profile_collector模块定义了数据采集接口,在其他模块可以通过如下方式获取该python模块注册的所有数据采集接口:
++
++```python
++from src.performance_collector import static_profile_collector
++from src.utils.shell_execute import get_registered_cmd_funcs
++
++data_collector_funcs = get_registered_cmd_funcs(static_profile_collector)
++print(data_collector_funcs)
++```
++
++这样就可以看到该模块定义的所有数据采集接口:
++
++```shell
++['func': <function ...>, 'tag': 'static', ...]
++``` ++ ++ ++ ++## 2.并行任务调度接口 ++ ++在采集数据框架中,经常会遇到某些任务需要并行执行提升采集数据效率的情况,此时可以使用并行任务调度的接口,这里并行任务也被集成到框架中可以使用,并行任务调度接口定义为: ++ ++```python ++class ThreadPoolManager: ++ def __init__(self, max_workers: int = 5): ++``` ++ ++| 参数名 | 参数类型 | 参数取值范围 | 参数含义说明 | ++| ----------- | -------- | ------------ | ------------------ | ++| max_workers | int | 1-16 | 同时工作的线程数量 | ++ ++> 注意:由于python的GIL限制,多线程仅适用于IO密集型任务,若为计算密集型任务,使用并行调度不会有效,这里远程执行cmd命令也属于IO密集型任务,推荐此种情况下使用并行接口 ++ +diff --git a/copilot-tune/src/utils/__init__.py b/copilot-tune/src/utils/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/src/utils/collector/__init__.py b/copilot-tune/src/utils/collector/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/src/utils/collector/collector_trigger.py b/copilot-tune/src/utils/collector/collector_trigger.py +new file mode 100644 +index 0000000..0cd4997 +--- /dev/null ++++ b/copilot-tune/src/utils/collector/collector_trigger.py +@@ -0,0 +1,202 @@ ++import logging ++import threading ++import time ++from enum import Enum, auto ++ ++import paramiko ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++# FIFO 文件路径 ++FIFO_PATH = "/tmp/euler-copilot-fifo" ++MAX_WAIT_TIMEOUT = 300 ++ ++ ++class TriggerStatus(Enum): ++ WAITING = auto() ++ TRIGGERED = auto() ++ CLOSE = auto() ++ ++ ++logging.basicConfig( ++ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" ++) ++ ++ ++class TriggerEventListener: ++ """ ++ 单例:在后台线程里通过 SSH 轮询远程文件内容, ++ 当内容为 '1' 时把状态置为 TRIGGERED。 ++ """ ++ _instance = None ++ _lock = threading.Lock() ++ ++ def __new__(cls, *a, **kw): ++ with cls._lock: ++ if cls._instance is None: ++ cls._instance = super().__new__(cls) ++ return cls._instance ++ ++ def __init__(self): ++ if getattr(self, "_ready", False): ++ return ++ self._ready = True ++ ++ # 远程信息(可在 configure 中修改) ++ self.host = None ++ self.port = 22 ++ self.user = None ++ self.password = None ++ self.remote_path = FIFO_PATH ++ ++ self.timeout = 300 ++ self.poll_interval = 1.0 # 秒 ++ ++ self._status = TriggerStatus.WAITING ++ self._status_lock = threading.Lock() ++ self._cond = threading.Condition(self._status_lock) ++ self._thread = None ++ self._stop_evt = threading.Event() ++ ++ # ---------- 配置 ---------- ++ def configure(self, host, port, user, password): ++ if self._thread and self._thread.is_alive(): ++ logging.warning("RemoteSSHTrigger already running, ignore configure.") ++ return self ++ self.host, self.port = host, port ++ self.user, self.password = user, password ++ self.remote_path = FIFO_PATH ++ self.timeout = 300 ++ self.poll_interval = 1.0 ++ return self ++ ++ # ---------- 状态 ---------- ++ def get_status(self) -> TriggerStatus: ++ with self._status_lock: ++ return self._status ++ ++ def wait(self, timeout=None): ++ with self._cond: ++ if self._status in (TriggerStatus.TRIGGERED, TriggerStatus.CLOSE): ++ return self._status ++ if timeout is None: ++ while self._status == TriggerStatus.WAITING: ++ self._cond.wait() ++ else: ++ end = time.time() + timeout ++ while self._status == TriggerStatus.WAITING: ++ left = end - time.time() ++ if left <= 0: ++ break ++ self._cond.wait(timeout=left) ++ return self._status ++ ++ def _set_status(self, new_status: TriggerStatus): ++ with self._cond: ++ if self._status in (TriggerStatus.TRIGGERED, TriggerStatus.CLOSE): ++ return ++ self._status = new_status ++ self._cond.notify_all() ++ ++ # ---------- 启动 ---------- ++ def run(self): ++ if self._thread and 
self._thread.is_alive(): ++ logging.warning("already running") ++ return ++ self._stop_evt.clear() ++ self._set_status(TriggerStatus.WAITING) ++ self._thread = threading.Thread(target=self._worker, daemon=True) ++ self._thread.start() ++ logging.info("RemoteSSHTrigger started polling %s@%s:%s", ++ self.user, self.host, self.remote_path) ++ ++ def stop(self): ++ self._stop_evt.set() ++ if self._thread: ++ self._thread.join(timeout=5) ++ ++ # ---------- 后台线程 ---------- ++ def _worker(self): ++ start = time.time() ++ ssh = None ++ try: ++ ssh = self._connect() ++ while not self._stop_evt.is_set(): ++ if time.time() - start >= self.timeout: ++ self._set_status(TriggerStatus.CLOSE) ++ break ++ try: ++ val = self._read_remote(ssh).strip() ++ if val == "1": ++ self._delete_remote(ssh) ++ self._set_status(TriggerStatus.TRIGGERED) ++ break ++ except Exception as e: ++ logging.warning("read error: %s", e) ++ # 可重连 ++ ssh = self._reconnect(ssh) ++ ++ time.sleep(self.poll_interval) ++ finally: ++ if ssh: ++ ssh.close() ++ ++ def _connect(self): ++ ssh = paramiko.SSHClient() ++ ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) ++ pkey = None ++ ssh.connect(self.host, port=self.port, ++ username=self.user, ++ password=self.password, ++ pkey=pkey, ++ timeout=10) ++ return ssh ++ ++ def _reconnect(self, old_ssh): ++ try: ++ old_ssh.close() ++ except: ++ pass ++ return self._connect() ++ ++ def _read_remote(self, ssh): ++ cmd = f"cat {self.remote_path}" ++ _, stdout, _ = ssh.exec_command(cmd, timeout=5) ++ return stdout.read().decode() ++ ++ def _delete_remote(self, ssh): ++ """删除 remote_path,失败仅警告""" ++ cmd = f"rm -f {self.remote_path}" ++ try: ++ _, stdout, stderr = ssh.exec_command(cmd, timeout=5) ++ exit_code = stdout.channel.recv_exit_status() ++ if exit_code == 0: ++ logging.debug("已删除远程文件 %s", self.remote_path) ++ else: ++ logging.debug("删除远程文件失败,exit=%s, err=%s", ++ exit_code, stderr.read().decode()) ++ except Exception as e: ++ logging.debug("删除远程文件异常: %s", e) ++ ++ ++if __name__ == "__main__": ++ # 1. 配置 ++ listener = TriggerEventListener().configure( ++ host="9.82.36.53", ++ user="root", ++ password="Huawei12#$", ++ port="22" ++ ++ ) ++ ++ # 2. 启动 ++ listener.run() ++ ++ # 3. 等待触发 ++ status = listener.wait() ++ print("trigger status:", status) ++ ++ # 4. 
后续逻辑
++    print("继续在本机执行其他命令...")
+diff --git a/copilot-tune/src/utils/collector/metric_collector.py b/copilot-tune/src/utils/collector/metric_collector.py
+new file mode 100644
+index 0000000..eb7c661
+--- /dev/null
++++ b/copilot-tune/src/utils/collector/metric_collector.py
+@@ -0,0 +1,229 @@
++import logging
++import inspect
++import traceback
++
++from time import sleep
++from enum import Enum
++from functools import wraps
++from typing import Callable, Any
++from types import ModuleType
++from collections import defaultdict
++from src.utils.common import ExecuteResult
++from src.utils.collector.collector_trigger import TriggerStatus, TriggerEventListener
++
++MAX_SAMPLE_COUNT = 100
++MAX_SAMPLE_INTERVAL = 600
++MAX_TASK_TIMEOUT = 300
++
++trigger_event_listener = TriggerEventListener()
++
++
++class CollectMode(Enum):
++    SYNC = "sync"
++    ASYNC = "async"
++
++
++class CollectType(Enum):
++    TRIGGERED = "triggered"
++    DIRECT = "direct"
++
++
++logging.basicConfig(
++    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
++)
++
++SYNC_DIRECT_TASKS = defaultdict(list)
++ASYNC_TASKS = defaultdict(list)
++SYNC_TRIGGERED_TASKS = defaultdict(list)
++
++# 四种采集方式组合:同步+直接采集、同步+触发采集、异步+直接采集、异步+触发采集
++# 对应四种任务队列,异步队列可以合并一起并行执行
++# 触发式采集执行前会阻塞任务,若收到采集信号才会执行
++TASK_MODE_MAP = {
++    (CollectMode.SYNC, CollectType.DIRECT): SYNC_DIRECT_TASKS,
++    (CollectMode.ASYNC, CollectType.DIRECT): ASYNC_TASKS,
++    (CollectMode.SYNC, CollectType.TRIGGERED): SYNC_TRIGGERED_TASKS,
++    (CollectMode.ASYNC, CollectType.TRIGGERED): ASYNC_TASKS,
++}
++
++
++def process_decorated_func(output: Any, func: Callable):
++    result = ExecuteResult()
++    try:
++        result.output = func(output)
++        result.err_msg = ""
++        result.status_code = 0
++    except Exception as e:
++        print(traceback.format_exc())
++        result.err_msg = str(e)
++        result.output = ""
++        result.status_code = -1
++    return result
++
++
++def period_task(
++    cmd: str,
++    tag: str = None,
++    delay: int = 0,
++    sample_count: int = 1,
++    interval: int = 0,
++    collect_mode: CollectMode = CollectMode.SYNC,
++    collect_type: CollectType = CollectType.DIRECT,
++):
++    """
++    周期采集任务
++    cmd: 命令字符串
++    tag: 任务标签
++    delay: 延迟采集任务,等业务趋于平稳后采集,单位s
++    sample_count: 周期任务采集次数
++    interval: 周期任务采集时间间隔,单位s
++    collect_mode: 采集任务的模式,有同步和异步两种
++    collect_type: 采集类型,有直接采集(业务持续运行状态)和触发式采集(通过benchmark压测的状态)
++    """
++
++    def decorator(func):
++        file = inspect.getfile(func)
++
++        @wraps(func)
++        def wrapper(ssh_client):
++            result = ExecuteResult()
++            if delay > 0:
++                sleep(delay)
++
++            if sample_count <= 0 or sample_count >= MAX_SAMPLE_COUNT:
++                raise ValueError(
++                    f"Invalid sample count {sample_count} for period task."
++                )
++
++            if interval <= 0 or interval >= MAX_SAMPLE_INTERVAL:
++                raise ValueError(f"Invalid sample interval {interval} for period task.")
++
++            if (
++                collect_mode == CollectMode.ASYNC
++                and collect_type == CollectType.TRIGGERED
++            ):
++                logging.info(
++                    f"task {func.__name__} is a triggered event, waiting for fifo signal ..."
++                )
++                event_status = trigger_event_listener.wait()
++                if event_status == TriggerStatus.CLOSE:
++                    logging.info(
++                        f"task {func.__name__} waiting for trigger signal timeout"
++                    )
++                    result.status_code = -1
++                    result.err_msg = (
++                        f"task {func.__name__} waiting for trigger signal timeout"
++                    )
++                    return result
++
++            logging.info(
++                f"period task {func.__name__} running, it will take {(sample_count - 1) * interval}s ..."
++            )
++            all_result = []
++
++            for cnt in range(sample_count):
++                cmd_result = ssh_client.run_cmd(cmd)
++                if cmd_result.status_code == 0:
++                    all_result.append(cmd_result.output)
++                else:
++                    all_result.append(None)
++                    logging.warning(
++                        f"failed to execute period task {func.__name__}, reason is {cmd_result.err_msg}"
++                    )
++                if cnt != sample_count - 1:
++                    sleep(interval)
++
++            if len(all_result) <= 0:
++                result.status_code = -1
++                result.err_msg = f"no data collected for period task {func.__name__}"
++            else:
++                processed_result = process_decorated_func(all_result, func)
++                result.status_code = 0
++                result.output = processed_result
++                logging.info(f"task {func.__name__} finished!")
++
++            return result
++
++        TASK_MODE_MAP[(collect_mode, collect_type)][file].append(
++            {"func": wrapper, "tag": tag}
++        )
++        return wrapper
++
++    return decorator
++
++
++def snapshot_task(
++    cmd: str,
++    tag: str = None,
++    collect_mode: CollectMode = CollectMode.SYNC,
++    collect_type: CollectType = CollectType.DIRECT,
++):
++    def decorator(func):
++        file = inspect.getfile(func)
++
++        @wraps(func)
++        def wrapper(ssh_client):
++            result = ExecuteResult()
++            if (
++                collect_mode == CollectMode.ASYNC
++                and collect_type == CollectType.TRIGGERED
++            ):
++                logging.info(
++                    f"task {func.__name__} is a triggered event, waiting for fifo signal ..."
++                )
++                event_status = trigger_event_listener.wait()
++                if event_status == TriggerStatus.CLOSE:
++                    logging.info(
++                        f"task {func.__name__} waiting for trigger signal timeout"
++                    )
++                    result.status_code = -1
++                    result.err_msg = (
++                        f"task {func.__name__} waiting for trigger signal timeout"
++                    )
++                    return result
++
++            logging.info(f"task {func.__name__} running ...")
++
++            cmd_result = ssh_client.run_cmd(cmd)
++            if cmd_result.status_code == 0:
++                processed_result = process_decorated_func(cmd_result.output, func)
++                result.status_code = cmd_result.status_code
++                result.output = processed_result
++                logging.info(f"task {func.__name__} finished!")
++            else:
++                result = cmd_result
++
++            return result
++
++        TASK_MODE_MAP[(collect_mode, collect_type)][file].append(
++            {"func": wrapper, "tag": tag}
++        )
++        return wrapper
++
++    return decorator
++
++
++def get_registered_module_tasks(
++    module: ModuleType,
++    collect_mode: CollectMode = CollectMode.SYNC,
++    collect_type: CollectType = CollectType.DIRECT,
++):
++    if not isinstance(module, ModuleType) or not hasattr(module, "__file__"):
++        raise RuntimeError(
++            f"module {module.__name__} has no attr __file__, maybe it is a built-in module"
++        )
++    caller_file = module.__file__
++    return TASK_MODE_MAP[(collect_mode, collect_type)].get(caller_file, [])
++
++
++def get_registered_modules_tasks(
++    modules: list[ModuleType],
++    collect_mode: CollectMode = CollectMode.SYNC,
++    collect_type: CollectType = CollectType.DIRECT,
++):
++    task_list = []
++    for module in modules:
++        task_list.extend(
++            get_registered_module_tasks(module, collect_mode, collect_type)
++        )
++    return task_list
+diff --git a/copilot-tune/src/utils/common.py b/copilot-tune/src/utils/common.py
+new file mode 100644
+index 0000000..58a07c4
+--- /dev/null
++++ b/copilot-tune/src/utils/common.py
+@@ -0,0 +1,95 @@
++
++from typing import Any
++from tabulate import tabulate
++
++
++def display_banner():
++    try:
++        import pyfiglet
++        banner = pyfiglet.figlet_format("EulerCopilot v1.0", font="slant")
++        print(banner)
++    except ImportError:
++        print("EulerCopilot v1.0")
++
++
++def truncate_string(s, max_length=30):
++    """
++    截断字符串,如果超过指定长度则在末尾加上...
++ """ ++ s = str(s) ++ if len(s) > max_length: ++ return s[: max_length - 3] + "..." ++ return s ++ ++ ++def flatten_dict(d, parent_key=None): ++ """ ++ 递归地将嵌套字典扁平化,键路径以 list[str] 形式存储 ++ """ ++ items = [] ++ if parent_key is None: ++ parent_key = [] ++ for k, v in d.items(): ++ new_key = parent_key + [k] # 将当前键添加到路径列表中 ++ if isinstance(v, dict): ++ items.extend(flatten_dict(v, new_key)) # 递归处理嵌套字典 ++ else: ++ items.append((new_key, v)) # 保存键路径和值 ++ return items ++ ++ ++def preview_data(data: dict, preview_nums: int = 5): ++ """ ++ 将扁平化的字典转换为列表,每个元素是一个元组 ++ """ ++ flattened = flatten_dict(data) ++ if len(flattened) > 2 * preview_nums: ++ # 确定省略号部分的键路径长度 ++ ellipsis_list = ["..."] * len(flattened[0][0]) ++ flattened = ( ++ flattened[:preview_nums] ++ + [(ellipsis_list, "...")] ++ + flattened[-preview_nums:] ++ ) ++ result = [] ++ for key_list, value in flattened: ++ # 将键路径和值都转换为字符串并截断 ++ truncated_keys = [truncate_string(k) for k in key_list] ++ truncated_value = truncate_string(value) ++ result.append(truncated_keys + [truncated_value]) # 将键路径和值组合成一个列表 ++ return result ++ ++ ++def display_metrics( ++ metric_data: dict, ++ headers: list[str] = ["metric_name", "metric_value"], ++ title: str = "", ++ preview_nums: int = 5, ++): ++ if not isinstance(metric_data, dict): ++ raise TypeError(f"display metric_data only support dict data now!") ++ ++ table_str = tabulate( ++ preview_data(metric_data), ++ headers=headers, ++ tablefmt="grid", ++ ) ++ display_content = "\n".join([title, table_str]) ++ print(display_content) ++ ++ ++class ExecuteResult: ++ def __init__(self, status_code: int = -1, output: Any = None, err_msg: str = ""): ++ self.status_code = status_code ++ self.output = output ++ self.err_msg = err_msg ++ ++ def __dict__(self): ++ return { ++ "status_code": self.status_code, ++ "err_msg": self.err_msg, ++ "output": self.output, ++ } ++ ++ def __repr__(self): ++ return str(self.__dict__()) +diff --git a/copilot-tune/src/utils/config/__init__.py b/copilot-tune/src/utils/config/__init__.py +new file mode 100644 +index 0000000..e69de29 +diff --git a/copilot-tune/src/utils/config/app_config.py b/copilot-tune/src/utils/config/app_config.py +new file mode 100644 +index 0000000..f1fa6c2 +--- /dev/null ++++ b/copilot-tune/src/utils/config/app_config.py +@@ -0,0 +1,305 @@ ++import os ++import re ++from enum import Enum ++from string import Template ++from dataclasses import dataclass, asdict, field ++ ++from src.utils.constant import SCRIPTS_PATH ++from src.utils.shell_execute import SshClient ++from src.utils.config.global_config import env_config, param_config ++from src.utils.metrics import PerformanceMetric ++ ++# 某个app需要注册私有的模板方法,可存在这里 ++REGISTERED_TEMPLATE = {} ++ ++ ++class ExecuteMode(Enum): ++ REMOTE = "remote" ++ LOCAL = "local" ++ ++ ++ALL_EXECUTE_MODES = "|".join(mode.value for mode in ExecuteMode) ++EXECUTE_MODE_PATTERN = re.compile(rf"^\$EXECUTE_MODE:\s*({ALL_EXECUTE_MODES})?\s*(.*)") ++ ++ ++def default_scripts_dir(): ++ scripts_dir = os.path.abspath( ++ os.path.join(os.path.dirname(__file__), "..", "..", "..", "scripts") ++ ) ++ ++ # 检查原始目录是否存在 ++ if os.path.exists(scripts_dir) and os.path.isdir(scripts_dir): ++ return scripts_dir ++ else: ++ return SCRIPTS_PATH ++ ++ ++def app_template(key): ++ """ ++ 类修饰器,重写AppInterface方法后,通过该接口注册模板方法 ++ 用于某些特殊的应用与默认流程不同,需要重载处理流程的情况 ++ ++ 下面给一个重写mysql执行benchmark的实例: ++ @app_template("mysql") # 使用该注解之后就默认会使用该自定义方法处理benchmark的流程 ++ class MysqlTemplate(AppTemplate): ++ def benchmark(self, ssh_client): ++ exec_result = 
ssh_client.run_cmd(self.benchmark) ++ if exec_result.status_code == 0: ++ return exec_result.result ++ else: ++ return 0 ++ """ ++ ++ def decorator(cls): ++ REGISTERED_TEMPLATE[key] = cls ++ return cls ++ ++ return decorator ++ ++ ++# 根据kwargs填充shell命令模板,生成可用的shell命令 ++def shell_template(template_str, **kwargs): ++ processed_str = re.sub(r"\$(\d+)", r"$$\1", template_str) ++ template = Template(processed_str) ++ processed_template = template.substitute(**kwargs) ++ postprocessed_str = re.sub(r"\$\$(\d+)", r"$\1", processed_template) ++ return postprocessed_str ++ ++ ++@dataclass ++class AppMetaConfig: ++ app_name: str ++ user: str ++ port: str ++ password: str ++ config_file: str ++ host_ip: str ++ host_port: int ++ SCRIPTS_DIR: str = field(default_factory=default_scripts_dir) ++ ++ ++class AppTemplate: ++ def __init__( ++ self, ++ ssh_client: SshClient, ++ app_name: str = "", ++ user: str = "", ++ port: str = "", ++ password: str = "", ++ config_file: str = "", ++ get_param_template: str = "", ++ set_param_template: str = "", ++ start_workload: str = "", ++ stop_workload: str = "", ++ recover_workload: str = "", ++ benchmark: str = "", ++ performance_metric: str = "" ++ ): ++ # 应用的基本配置,填充模板可能会用到 ++ self.meta_data = asdict( ++ AppMetaConfig( ++ app_name=app_name, ++ user=user, ++ port=port, ++ password=password, ++ config_file=config_file, ++ host_ip=ssh_client.host_ip, ++ host_port=ssh_client.host_port, ++ ) ++ ) ++ self.app_params = param_config.get(app_name) ++ self.system_params = param_config.get("system") ++ self.ssh_client = ssh_client ++ self.get_param_template = get_param_template ++ self.set_param_template = set_param_template ++ self.start_workload_cmd = start_workload ++ self.stop_workload_cmd = stop_workload ++ self.recover_workload_cmd = recover_workload ++ self.benchmark_cmd = benchmark ++ if app_name != "system": ++ try: ++ self.performance_metric = PerformanceMetric[performance_metric] ++ except KeyError: ++ supported_metrics = list(PerformanceMetric.__members__.keys()) ++ raise KeyError( ++ f"Performance metric '{performance_metric}' is not supported. 
" ++ f"Supported metrics are: {supported_metrics}" ++ ) ++ else: ++ self.performance_metric = PerformanceMetric["QPS"] ++ self.mode_map = { ++ ExecuteMode.REMOTE: ssh_client.run_cmd, ++ ExecuteMode.LOCAL: ssh_client.run_local_cmd, ++ } ++ ++ def extract_mode(self, cmd): ++ match = EXECUTE_MODE_PATTERN.match(cmd) ++ if match: ++ mode_str = match.group(1) ++ remaining_string = match.group(2).strip() ++ mode = ExecuteMode.REMOTE if mode_str is None else ExecuteMode(mode_str) ++ return self.mode_map[mode], remaining_string ++ else: ++ return self.mode_map[ExecuteMode.REMOTE], cmd ++ ++ def get_param(self, param_name): ++ if param_name in self.system_params: ++ get_param_template = self.system_params[param_name]["get"] ++ else: ++ get_param_template = self.get_param_template ++ if not self.get_param_template: ++ return None ++ run_cmd_func, cmd = self.extract_mode(get_param_template) ++ cmd = shell_template( ++ cmd, ++ param_name=param_name, ++ **self.meta_data, ++ ) ++ return run_cmd_func(cmd) ++ ++ def set_param(self, param_name, param_value): ++ if param_name in self.system_params: ++ set_param_template = self.system_params[param_name]["set"] ++ else: ++ set_param_template = self.set_param_template ++ if not self.set_param_template: ++ return None ++ run_cmd_func, cmd = self.extract_mode(set_param_template) ++ cmd = shell_template( ++ cmd, ++ param_name=param_name, ++ param_value=param_value, ++ **self.meta_data, ++ ) ++ return run_cmd_func(cmd) ++ ++ def start_workload(self): ++ if not self.start_workload_cmd: ++ return None ++ run_cmd_func, cmd = self.extract_mode(self.start_workload_cmd) ++ cmd = shell_template( ++ cmd, ++ **self.meta_data, ++ ) ++ return run_cmd_func(cmd) ++ ++ def stop_workload(self): ++ if not self.stop_workload_cmd: ++ return None ++ run_cmd_func, cmd = self.extract_mode(self.stop_workload_cmd) ++ cmd = shell_template( ++ cmd, ++ **self.meta_data, ++ ) ++ return run_cmd_func(cmd) ++ ++ def recover_workload(self): ++ if not self.recover_workload_cmd: ++ return None ++ run_cmd_func, cmd = self.extract_mode(self.recover_workload_cmd) ++ cmd = shell_template( ++ cmd, ++ **self.meta_data, ++ ) ++ return run_cmd_func(cmd) ++ ++ def generate_set_command(self, param_name, param_value): ++ if param_name in self.system_params: ++ set_param_template = self.system_params[param_name]["set"] ++ else: ++ set_param_template = self.set_param_template ++ if not self.set_param_template: ++ return None ++ _, cmd = self.extract_mode(set_param_template) ++ ++ formatted_cmd = shell_template( ++ cmd, ++ param_name=param_name, ++ param_value=param_value, ++ **self.meta_data, ++ ) ++ return formatted_cmd ++ ++ def benchmark(self): ++ if not self.benchmark_cmd: ++ return None ++ run_cmd_func, cmd = self.extract_mode(self.benchmark_cmd) ++ cmd = shell_template( ++ cmd, ++ **self.meta_data, ++ ) ++ return run_cmd_func(cmd) ++ ++ def get_calculate_type(self): ++ if self.performance_metric == PerformanceMetric.DURATION or self.performance_metric == PerformanceMetric.RT: ++ # 耗时和响应时间等越小越好 ++ return -1 ++ else: ++ # QPS 和吞吐量等越大越好 ++ return 1 ++ ++ ++# 将配置文件反序列化成可执行的函数,例如: ++# 下面就是一个实例化mysql应用 ++# application = AppInterface() ++# application.mysql.start_workload() ++class AppInterface: ++ _instance = None ++ _initialized = False ++ ++ def __new__(cls, ssh_client: SshClient): ++ if cls._instance is None: ++ cls._instance = super(AppInterface, cls).__new__(cls) ++ return cls._instance ++ ++ def __init__(self, ssh_client: SshClient): ++ # 防止重复初始化 ++ if not getattr(self, "_initialized", False): ++ 
self._config = env_config.get("app_config") ++ self._instances = {} ++ self.ssh_client = ssh_client # 保存 ssh_client ++ self._initialize_instances() ++ self.__class__._initialized = True ++ ++ def _initialize_instances(self): ++ for app_name, app_config in self._config.items(): ++ # 优先使用 REGISTERED_TEMPLATE 中的类 ++ cls = REGISTERED_TEMPLATE.get(app_name, AppTemplate) ++ self._instances[app_name] = cls( ++ app_name=app_name, ssh_client=self.ssh_client, **app_config ++ ) ++ ++ def __contains__(self, param_name): ++ pass ++ ++ def __getattr__(self, item): ++ return self.get(item) ++ ++ def get(self, item): ++ if item in self._instances: ++ return self._instances[item] ++ raise AttributeError(f"'AppInterface' object has no attribute '{item}'") ++ ++ ++if __name__ == "__main__": ++ class SshClient: ++ def __init__(self): ++ pass ++ ++ def run_cmd(self, cmd): ++ print(cmd) ++ ++ ++ ssh_client = SshClient() ++ app_interface = AppInterface(ssh_client) ++ app = app_interface.mysql ++ sys = app_interface.system ++ ++ app.get_param("hello") ++ app.set_param("hello", 1) ++ app.start_workload() ++ app.stop_workload() ++ app.benchmark() ++ ++ sys.get_param("aaa") ++ sys.set_param("bbb", 2) +diff --git a/copilot-tune/src/utils/config/global_config.py b/copilot-tune/src/utils/config/global_config.py +new file mode 100644 +index 0000000..17b0595 +--- /dev/null ++++ b/copilot-tune/src/utils/config/global_config.py +@@ -0,0 +1,93 @@ ++import os ++import json ++import yaml ++from typing import Any, Optional, Dict, List ++from pathlib import Path ++ ++from src.utils.constant import CONFIG_PATH, KNOWLEDGE_PATH ++ ++ ++class EnvironConfig: ++ def __init__(self, config_dir: str): ++ """ ++ 初始化配置加载器,读取指定目录下的所有 .json / .yaml / .yml 文件 ++ ++ :param config_dir: 配置文件所在目录 ++ """ ++ self.config_dir = Path(config_dir).resolve() ++ if not self.config_dir.exists() or not self.config_dir.is_dir(): ++ raise FileNotFoundError(f"Config directory not found: {config_dir}") ++ ++ self._configs: Dict[str, dict] = {} ++ ++ # 加载所有支持的配置文件 ++ self._load_all_config_files() ++ ++ def _load_all_config_files(self): ++ for file in self.config_dir.iterdir(): ++ if file.suffix.lower() in (".json", ".yaml", ".yml"): ++ name = file.stem # 不带扩展名的文件名 ++ try: ++ with open(file, "r", encoding="utf-8") as f: ++ if file.suffix == ".json": ++ self._configs[name] = json.load(f) ++ else: ++ self._configs[name] = yaml.safe_load(f) or {} ++ except Exception as e: ++ print( ++ f"[Warning] Failed to load config file: {file.name}, error: {e}" ++ ) ++ ++ def get(self, key_path: str, default: Optional[Any] = None) -> Any: ++ """ ++ 获取配置值,使用点号分隔的路径访问嵌套字段 ++ ++ :param key_path: 如 "filename" 或 "filename.section.key" ++ :param default: 如果找不到返回的默认值 ++ :return: 配置值或默认值 ++ """ ++ keys = key_path.split(".") ++ filename = keys[0] ++ ++ config = self._configs.get(filename) ++ if config is None: ++ return default ++ ++ if len(keys) == 1: ++ return config ++ ++ current = config ++ for key in keys[1:]: ++ if isinstance(current, dict) and key in current: ++ current = current[key] ++ else: ++ return default ++ return current ++ ++ def find_key(self, target_key: str) -> List[str]: ++ """ ++ 查找某个 key 在哪些配置文件中作为顶层 key 出现过 ++ :param target_key: 要查找的 key 名称 ++ :return: 包含这个 key 的所有文件名列表(不带扩展名) ++ """ ++ result = [] ++ for filename, config in self._configs.items(): ++ if isinstance(config, dict) and target_key in config: ++ result.append(filename) ++ return result ++ ++ ++DEFAULT_CONFIG_PATH = os.path.abspath( ++ os.path.join(os.path.dirname(__file__), "..", "..", "..", "config") 
++)
++if not os.path.exists(DEFAULT_CONFIG_PATH) or not os.path.isdir(DEFAULT_CONFIG_PATH):
++    DEFAULT_CONFIG_PATH = CONFIG_PATH
++
++PARAMS_PATH = os.path.abspath(
++    os.path.join(os.path.dirname(__file__), "..", "..", "knowledge_base", "knob_params")
++)
++if not os.path.exists(PARAMS_PATH) or not os.path.isdir(PARAMS_PATH):
++    PARAMS_PATH = os.path.join(KNOWLEDGE_PATH, "knob_params")
++
++env_config = EnvironConfig(DEFAULT_CONFIG_PATH)
++param_config = EnvironConfig(PARAMS_PATH)
+diff --git a/copilot-tune/src/utils/constant.py b/copilot-tune/src/utils/constant.py
+new file mode 100644
+index 0000000..2dc1e7f
+--- /dev/null
++++ b/copilot-tune/src/utils/constant.py
+@@ -0,0 +1,6 @@
++CONFIG_PATH = "/etc/euler-copilot-tune/config"
++KNOWLEDGE_PATH = "/etc/euler-copilot-tune/knowledge_base"
++ENV_CONFIG_PATH = "/etc/euler-copilot-tune/config/.env.yaml"
++KNOB_RAG_CONFIG_PATH = "/etc/euler-copilot-tune/config/knob_rag_config.json"
++OPTIMIZE_CONFIG_PATH = "/etc/euler-copilot-tune/config/optimize_config.yaml"
++SCRIPTS_PATH = "/etc/euler-copilot-tune/scripts"
+diff --git a/copilot-tune/src/utils/json_repair.py b/copilot-tune/src/utils/json_repair.py
+new file mode 100644
+index 0000000..19b8575
+--- /dev/null
++++ b/copilot-tune/src/utils/json_repair.py
+@@ -0,0 +1,15 @@
++import json
++from typing import Dict
++
++# TODO: implement a more general repair
++def json_repair(
++    json_str: str
++) -> Dict:
++    json_start = json_str.find('{')
++    json_end = json_str.rfind('}') + 1
++    json_str = json_str[json_start:json_end]
++    try:
++        json_data = json.loads(json_str)
++    except json.decoder.JSONDecodeError as e:
++        raise RuntimeError(f"failed to parse json, raw json_str is {json_str}") from e
++    return json_data
+\ No newline at end of file
+diff --git a/copilot-tune/src/utils/llm.py b/copilot-tune/src/utils/llm.py
+new file mode 100644
+index 0000000..5691dc7
+--- /dev/null
++++ b/copilot-tune/src/utils/llm.py
+@@ -0,0 +1,33 @@
++import re
++from typing import List
++import requests
++from src.config import config
++from langchain_openai import ChatOpenAI
++
++requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
++requests.Session.verify = False
++
++def get_llm_response(prompt: str) -> str:
++    client = ChatOpenAI(
++        openai_api_key=config["LLM_KEY"],
++        openai_api_base=config["LLM_URL"],
++        model_name=config["LLM_MODEL_NAME"],
++        tiktoken_model_name="cl100k_base",
++        max_tokens=config["LLM_MAX_TOKENS"],
++        streaming=True
++    )
++    result = client.invoke(input=prompt)
++    # 去除模型回复中的 <think>...</think> 推理段落
++    return re.sub(r"<think>.*?</think>", "", result.content, flags=re.DOTALL)
++
++
++def get_embedding(text: str) -> List[float]:
++    data = {
++        "model": config["REMOTE_EMBEDDING_MODEL_NAME"],
++        "texts": [text]
++    }
++    res = requests.post(url=config['REMOTE_EMBEDDING_ENDPOINT'], json=data, verify=False)
++    if res.status_code != 200:
++        return []
++    return res.json()[0]
++
++
+\ No newline at end of file
+diff --git a/copilot-tune/src/utils/manager/__init__.py b/copilot-tune/src/utils/manager/__init__.py
+new file mode 100644
+index 0000000..e69de29
+diff --git a/copilot-tune/src/utils/manager/task_manager.py b/copilot-tune/src/utils/manager/task_manager.py
+new file mode 100644
+index 0000000..65f3d31
+--- /dev/null
++++ b/copilot-tune/src/utils/manager/task_manager.py
+@@ -0,0 +1,185 @@
++import logging
++
++
++from types import ModuleType
++from tabulate import tabulate
++from src.utils.shell_execute import SshClient
++from src.utils.thread_pool import serial_task_manager, thread_pool_manager
++from src.utils.collector.collector_trigger import TriggerEventListener, TriggerStatus
++from src.utils.collector.metric_collector import (
++    get_registered_module_tasks,
++    get_registered_modules_tasks,
++    CollectMode,
++    CollectType,
++)
++
++MAX_TASK_TIMEOUT = 300
++triggered_event_listener = TriggerEventListener()
++
++logging.basicConfig(
++    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
++)
++
++
++def wait_for_signal():
++    # waiting状态会阻塞程序,close状态和triggered状态是立即返回的
++    event_status = triggered_event_listener.wait()
++
++    if event_status == TriggerStatus.CLOSE:
++        logging.info("[TaskManager] waiting for trigger signal timeout, skip tasks")
++        return False
++    return True
++
++
++class AbstractTaskManager:
++
++    def __init__(self, ssh_client: SshClient, timeout: int = 60):
++        self.ssh_client = ssh_client
++        self.timeout = timeout
++
++    def _wrap_collector_task(self, tasks: list):
++        return [
++            (task["func"], (self.ssh_client,), {"tag": task.pop("tag", "default_tag")})
++            for task in tasks
++        ]
++
++    def display_stats(self, collect_mode: CollectMode):
++        if not self.modules:
++            return
++
++        logging.info(f"[{collect_mode.name} scheduler] collecting following metrics...")
++        modules_name = {
++            str(module.__name__).split(".")[-1]: module for module in self.modules
++        }
++        rows = []
++        for module_name, module in modules_name.items():
++            for collect_type in [CollectType.TRIGGERED, CollectType.DIRECT]:
++                task_list = get_registered_module_tasks(
++                    module, collect_mode, collect_type
++                )
++                for task in task_list:
++                    rows.append(
++                        [
++                            module_name,
++                            task["func"].__name__,
++                            collect_type.name,
++                            collect_mode.name,
++                        ]
++                    )
++
++        if not rows:
++            return
++        table_str = tabulate(
++            rows,
++            headers=["module_name", "task_name", "collect_type", "collect_mode"],
++            tablefmt="grid",
++        )
++        print("\n" + table_str + "\n")
++
++
++# 给定注册任务的模块,获取对应任务
++class SyncTasksManager(AbstractTaskManager):
++    def __init__(
++        self,
++        ssh_client: SshClient,
++        modules: list[ModuleType],
++        timeout: int = 60,
++    ):
++        super().__init__(ssh_client=ssh_client, timeout=timeout)
++        self.modules = modules
++        self.direct_collect_tasks = get_registered_modules_tasks(
++            modules=modules,
++            collect_mode=CollectMode.SYNC,
++            collect_type=CollectType.DIRECT,
++        )
++        self.triggered_collect_tasks = get_registered_modules_tasks(
++            modules=modules,
++            collect_mode=CollectMode.SYNC,
++            collect_type=CollectType.TRIGGERED,
++        )
++
++    def run(self):
++        self.display_stats(CollectMode.SYNC)
++        direct_tasks = []
++        triggered_tasks = []
++
++        if len(self.direct_collect_tasks) > 0:
++            # 优先执行直接采集的任务
++            serial_task_manager.add_batch(
++                self._wrap_collector_task(self.direct_collect_tasks)
++            )
++
++            serial_task_manager.run_all_tasks()
++            direct_tasks = serial_task_manager.get_all_results()
++
++        if len(self.triggered_collect_tasks) > 0 and wait_for_signal():
++            serial_task_manager.add_batch(
++                self._wrap_collector_task(self.triggered_collect_tasks)
++            )
++
++            serial_task_manager.run_all_tasks()
++
++            triggered_tasks = serial_task_manager.get_all_results()
++
++        return direct_tasks + triggered_tasks
++
++
++class AsyncTaskManager(AbstractTaskManager):
++
++    def __init__(
++        self, ssh_client: SshClient, modules: list[ModuleType], timeout: int = 60
++    ):
++        super().__init__(ssh_client=ssh_client, timeout=timeout)
++        self.modules = modules
++        self.collect_tasks = get_registered_modules_tasks(
++            modules=modules, collect_mode=CollectMode.ASYNC
++        )
++
++    def run(self):
++        self.display_stats(CollectMode.ASYNC)
++
++    def _wrap_collector_task(self, tasks: list):
++        # NOTE: pop() removes "tag" from the task dict so it is recorded by the
++        # task manager instead of being passed to the collector itself.
++        return [
++            (task["func"], (self.ssh_client,), {"tag": task.pop("tag", "default_tag")})
++            for task in tasks
++        ]
++
++    def display_stats(self, collect_mode: CollectMode):
++        if not self.modules:
++            return
++
++        logging.info(f"[{collect_mode.name} scheduler] collecting the following metrics...")
++        modules_name = {
++            str(module.__name__).split(".")[-1]: module for module in self.modules
++        }
++        rows = []
++        for module_name, module in modules_name.items():
++            for collect_type in [CollectType.TRIGGERED, CollectType.DIRECT]:
++                task_list = get_registered_module_tasks(
++                    module, collect_mode, collect_type
++                )
++                for task in task_list:
++                    rows.append(
++                        [
++                            module_name,
++                            task["func"].__name__,
++                            collect_type.name,
++                            collect_mode.name,
++                        ]
++                    )
++
++        if not rows:
++            return
++        table_str = tabulate(
++            rows,
++            headers=["module_name", "task_name", "collect_type", "collect_mode"],
++            tablefmt="grid",
++        )
++        print("\n" + table_str + "\n")
++
++
++# Given the modules that registered tasks, fetch the matching tasks and run them.
++class SyncTasksManager(AbstractTaskManager):
++    def __init__(
++        self,
++        ssh_client: SshClient,
++        modules: list[ModuleType],
++        timeout: int = 60,
++    ):
++        super().__init__(ssh_client=ssh_client, timeout=timeout)
++        self.modules = modules
++        self.direct_collect_tasks = get_registered_modules_tasks(
++            modules=modules,
++            collect_mode=CollectMode.SYNC,
++            collect_type=CollectType.DIRECT,
++        )
++        self.triggered_collect_tasks = get_registered_modules_tasks(
++            modules=modules,
++            collect_mode=CollectMode.SYNC,
++            collect_type=CollectType.TRIGGERED,
++        )
++
++    def run(self):
++        self.display_stats(CollectMode.SYNC)
++        direct_tasks = []
++        triggered_tasks = []
++
++        if len(self.direct_collect_tasks) > 0:
++            # Run the direct-collection tasks first.
++            serial_task_manager.add_batch(
++                self._wrap_collector_task(self.direct_collect_tasks)
++            )
++
++            serial_task_manager.run_all_tasks()
++            direct_tasks = serial_task_manager.get_all_results()
++
++        if len(self.triggered_collect_tasks) > 0 and wait_for_signal():
++            serial_task_manager.add_batch(
++                self._wrap_collector_task(self.triggered_collect_tasks)
++            )
++
++            serial_task_manager.run_all_tasks()
++
++            triggered_tasks = serial_task_manager.get_all_results()
++
++        return direct_tasks + triggered_tasks
++
++
++class AsyncTaskManager(AbstractTaskManager):
++
++    def __init__(
++        self, ssh_client: SshClient, modules: list[ModuleType], timeout: int = 60
++    ):
++        super().__init__(ssh_client=ssh_client, timeout=timeout)
++        self.modules = modules
++        self.collect_tasks = get_registered_modules_tasks(
++            modules=modules, collect_mode=CollectMode.ASYNC
++        )
++
++    def run(self):
++        self.display_stats(CollectMode.ASYNC)
++        thread_pool_manager.add_batch(self._wrap_collector_task(self.collect_tasks))
++
++        thread_pool_manager.run_all_tasks()
++        return thread_pool_manager.get_all_results()
++
++
++class TaskManager:
++    def __init__(
++        self,
++        ssh_client: SshClient,
++        modules: list[ModuleType],
++        timeout: int = 60,
++        global_trigger_mode: bool = False,
++        debug: bool = False,
++    ):
++        self.global_trigger_mode = global_trigger_mode
++        self.debug = debug
++
++        if global_trigger_mode:
++            logging.info("using global trigger mode")
++
++        self.sync_task_manager = SyncTasksManager(
++            ssh_client=ssh_client, modules=modules, timeout=timeout
++        )
++        self.async_task_manager = AsyncTaskManager(
++            ssh_client=ssh_client, modules=modules, timeout=timeout
++        )
++
++    def run(self):
++        sync_result = self.sync_task_manager.run()
++        async_result = self.async_task_manager.run()
++        task_results = sync_result + async_result
++
++        if self.debug:
++            for task_result in task_results:
++                if task_result.status_code != 0 or task_result.result.status_code != 0:
++                    print(task_result.result.output)
++
++        collect_result = {}
++        for task_result in task_results:
++            if task_result.status_code == 0 and task_result.result.status_code == 0:
++                collect_result[task_result.tag] = task_result.result.output.output
++            else:
++                logging.warning(f"failed to collect {task_result.tag}")
++                print(task_result.result.err_msg)
++        return collect_result
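++
++
++# Usage sketch (illustrative; the host values and the collector module are
++# assumptions, not part of this module):
++#   client = SshClient(host_ip="192.0.2.10", host_password="***")
++#   manager = TaskManager(ssh_client=client, modules=[cpu_collector], timeout=60)
++#   metrics = manager.run()  # -> {"cpu_stat": "...", ...}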
logging.error(f"配置文件 {config_path} 格式错误。") ++ raise Exception(f"配置文件 {config_path} 格式错误。") ++ ++ def get_query_list( ++ self, ++ ) -> list: ++ # prompt = f""" ++ # 你是一个经验丰富的linux故障分析专家,你的任务是根据给定的系统分析报告,分析出当前系统存在的问题。 ++ # 根据系统分析报告给出的结果,你需要做如下两件事情: ++ # 1.分析系统报告,根据系统报告判断哪些方面有问题,描述该问题可能产生的原因 ++ # 2.输出有问题的情况,针对没有潜在问题和性能瓶颈的方面请不要输出,包括对系统的建议也不要输出 ++ # 分析的结果用list格式输出,严格按照一行一个结果的格式,以数字开头,不要添加额外的输入语句,不要换行,一条问题写一行,每条结果前面加上数字编号后面跟上输出结果。 ++ # 请注意仅仅只输出系统中有问题的指标,如果该场景没有问题也没有明显瓶颈,就不要输出在结果中,另外针对系统的建议也不需要输出。 ++ # 系统分析报告是:{self.system_report} ++ # """ ++ prompt = f""" ++ # CONTEXT # ++ 当前linux系统性能分析报告是: ++ {self.system_report} ++ ++ # OBJECTIVE # ++ 请根据给定的系统性能分析报告,分析出当前系统存在的性能问题。 ++ 要求: ++ 1.分析系统报告,根据系统报告判断哪些方面有问题,描述该问题可能产生的原因 ++ 2.输出有问题的情况,针对没有潜在问题和性能瓶颈的方面请不要输出,包括对系统的建议也不要输出 ++ ++ # STYLE # ++ 你是一个经验丰富的linux故障分析专家,你的回答应该逻辑严谨、表述客观、简洁易懂、条理清晰 ++ ++ # Tone # ++ 你应该尽可能秉承严肃、认真、严谨的态度 ++ ++ # AUDIENCE # ++ 你的答案将会是其他系统运维专家的重要参考意见,请尽可能提供真实有用的信息,不要胡编乱造。 ++ ++ # RESPONSE FORMAT # ++ 分析的结果用list格式输出,严格按照一行一个结果的格式,以数字开头,不要添加额外的输入语句,不要换行,一条问题写一行,每条结果前面加上数字编号后面跟上输出结果。 ++ ++ """ ++ ++ response = get_llm_response(prompt) ++ pattern = r"^\s*[\d\.|-].*" ++ matches = re.findall(pattern, response, re.MULTILINE) ++ return matches if matches else [] ++ ++ # 构建索引 ++ def build_index( ++ self, ++ file_name: str, ++ ) -> Tuple[faiss.IndexFlatIP, list]: ++ docs = [] ++ # with open(f"{file_name}.jsonl", "r", encoding="utf-8") as f: ++ current_file_path = os.path.abspath(__file__) ++ current_dir_path = os.path.dirname(current_file_path) ++ file_name = f"{file_name}.jsonl" ++ config_path = os.path.join(current_dir_path, '..', '..', 'knowledge_base', 'optimize', 'parameter', file_name) ++ with open(config_path, "r", encoding="utf-8") as f: ++ for line in f.readlines(): ++ docs.append(json.loads(line)) ++ ++ index_path = f"{file_name}_index.pkl" ++ if os.path.exists(index_path): ++ print(f"Detect cached index, read index from {index_path} ...") ++ with open(index_path, 'rb') as file: ++ index = pickle.load(file) ++ print(type(index), type(docs)) ++ return index, docs ++ ++ embeddings = [] ++ for doc in tqdm(docs, desc=f"Building index for {file_name}..."): ++ query_embedding = get_embedding(doc["content"]) ++ embeddings.append(query_embedding) ++ normalized_embeddings = normalize(np.array(embeddings).astype('float32')) ++ d = len(embeddings[0]) ++ index = faiss.IndexFlatIP(d) ++ index.add(normalized_embeddings) ++ ++ with open(index_path, 'wb') as file: ++ pickle.dump(index, file) ++ ++ print(type(index), type(docs)) ++ ++ return index, docs ++ ++ # 召回top5且阈值大于0.6的样本 ++ # 返回值类型? 
++    # Recall the top-k samples whose similarity exceeds the threshold.
++    def retrieve(
++        self,
++        index: faiss.Index,
++        docs: list,
++        query_list: list,
++    ) -> list:
++        result = {}
++
++        for query_data in query_list:
++            query_embedding = get_embedding(query_data)
++            # D holds similarity scores, I the matching document indices.
++            D, I = index.search(normalize(np.array(query_embedding).astype('float32').reshape(1, -1)), self.topk)
++
++            for idx, score in zip(I[0], D[0]):
++                if score > self.threshold:
++                    # keep the best score seen for each document across queries
++                    result[idx] = max(result.get(idx, score), score)
++        ranked = [docs[item[0]] for item in sorted(result.items(), key=lambda x: x[1], reverse=True)]
++        return ranked[:self.topk]
++
++    def run(self) -> list:
++        query_list = self.get_query_list()
++        if self.bottle_neck.lower() == "cpu":
++            query_list.append("CPU密集型任务")
++        if not query_list:
++            return []
++        logging.info(f"query is: {query_list}")
++
++        # Recall matching parameters separately for the system group and the
++        # application group, then merge the two result lists.
++        system_index, system_docs = self.build_index("system")
++        system_result = self.retrieve(system_index, system_docs, query_list)
++        if self.application.lower() != "none":
++            self.application = self.application.lower()
++            application_index, application_docs = self.build_index(self.application)
++            application_result = self.retrieve(application_index, application_docs, query_list)
++        else:
++            application_result = []
++        final_result = system_result + application_result
++        return [x["param_name"] for x in final_result]
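++
++
++# Usage sketch (condensed from the original inline demo; the report text comes
++# from the system-analysis stage, and the output list is illustrative):
++#   rag = KnobRag(config_path=KNOB_RAG_CONFIG_PATH, bottle_neck="CPU",
++#                 application="mysql", system_report=report_text)
++#   params = rag.run()
++#   # e.g. ['vm.swappiness', 'vm.min_free_kbytes', 'vm.dirty_expire_centisecs', ...]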
+diff --git a/copilot-tune/src/utils/shell_execute.py b/copilot-tune/src/utils/shell_execute.py
+new file mode 100644
+index 0000000..0b98763
+--- /dev/null
++++ b/copilot-tune/src/utils/shell_execute.py
+@@ -0,0 +1,179 @@
++import inspect
++import logging
++import shlex
++import subprocess
++import time
++import traceback
++from collections import defaultdict
++from functools import wraps
++from types import ModuleType
++from typing import Callable
++
++import paramiko
++
++from src.utils.common import ExecuteResult
++
++decorated_funcs = defaultdict(list)
++cmds_registry = defaultdict(list)
++
++# Configure logging
++logging.basicConfig(
++    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
++)
++
++
++# NOTE: the retry budget is fixed when the method is decorated; the
++# max_retries/delay attributes stored on SshClient below are not consulted here.
++def retryable(max_retries: int = 3, delay: int = 1):
++    def decorator(func):
++        @wraps(func)
++        def wrapper(self, *args, **kwargs):
++            retries = 0
++            while retries < max_retries:
++                try:
++                    return func(self, *args, **kwargs)
++                except Exception as e:
++                    retries += 1
++                    logging.warning(
++                        f"Attempt {retries} failed in function '{func.__name__}': {e}"
++                    )
++                    if retries < max_retries:
++                        logging.warning(f"Retrying in {delay} second(s)...")
++                        time.sleep(delay)
++                    else:
++                        logging.error(
++                            f"Function '{func.__name__}' failed after {retries} attempts."
++                        )
++                        raise
++
++        return wrapper
++
++    return decorator
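++
++
++# Example (sketch): retry a flaky remote probe up to 3 times, 1 second apart.
++#   @retryable(max_retries=3, delay=1)
++#   def probe(self):
++#       return self.run_cmd("uptime")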
++
++
++class SshClient:
++    def __init__(
++        self,
++        host_ip: str = "",
++        host_port: int = 22,
++        host_user: str = "root",
++        host_password: str = "",
++        max_retries: int = 0,
++        delay: float = 1.0,
++    ):
++        self.host_ip = host_ip
++        self.host_port = host_port
++        self.host_user = host_user
++        self.host_password = host_password
++
++        self.max_retries = max_retries
++        self.delay = delay
++
++    @retryable()
++    def run_cmd(self, cmd) -> ExecuteResult:
++        # A fresh SSH connection is opened per command and closed afterwards.
++        client = paramiko.SSHClient()
++        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
++        result = ExecuteResult()
++        try:
++            client.connect(
++                self.host_ip, self.host_port, self.host_user, self.host_password
++            )
++            _, stdout, stderr = client.exec_command(cmd)
++            result.output = stdout.read().decode().strip()
++            result.err_msg = stderr.read().decode()
++            result.status_code = stdout.channel.recv_exit_status()
++        except Exception:
++            result.status_code = -1
++            result.output = ""
++            result.err_msg = traceback.format_exc()
++        finally:
++            client.close()
++        return result
++
++    @retryable()
++    def run_local_cmd(self, cmd):
++        result = ExecuteResult()
++        try:
++            # Split the command string into an argument list with shlex.split
++            args = shlex.split(cmd)
++            shell_result = subprocess.run(
++                args,
++                check=True,
++                stdout=subprocess.PIPE,
++                stderr=subprocess.PIPE,
++                text=True,
++            )
++            result.output = shell_result.stdout.strip()
++            result.err_msg = shell_result.stderr.strip()
++            result.status_code = shell_result.returncode
++        except subprocess.CalledProcessError as e:
++            result.output = ""
++            result.err_msg = e.stderr.strip()
++            result.status_code = e.returncode
++        except Exception as e:
++            result.output = ""
++            result.err_msg = str(e)
++            result.status_code = -1
++        return result
++
++    @retryable()
++    def run_background_command(self, cmd) -> str:
++        """Run a command in the background on the remote host and return its PID."""
++        full_cmd = f"nohup {cmd} > /dev/null 2>&1 & echo $!"
++        result = self.run_cmd(full_cmd)
++        pid = result.output.strip()
++        if not pid.isdigit():
++            raise RuntimeError("Failed to get PID")
++        return pid
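++
++
++# Usage sketch (hypothetical host values):
++#   client = SshClient(host_ip="192.0.2.10", host_password="***")
++#   res = client.run_cmd("uname -r")
++#   print(res.status_code, res.output)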
++
++
++def process_decorated_func(
++    result: ExecuteResult, func: Callable, *args, **kwargs
++):
++    try:
++        processed_result = func(result.output, *args, **kwargs)
++        result.output = processed_result
++    except Exception as e:
++        logging.error(traceback.format_exc())
++        result.err_msg = str(e)
++    return result
++
++
++# Register a command collector: run `cmd` over SSH, then post-process its
++# stdout with the decorated function.
++def cmd_pipeline(
++    cmd: str = "",
++    tag: str = "default_tag",
++    parallel: bool = False,
++):
++    def decorator(func):
++        file = inspect.getfile(func)
++
++        @wraps(func)
++        def wrapper(ssh_client, *args, **kwargs):
++            result = ssh_client.run_cmd(cmd)
++            if result.status_code == 0:
++                return process_decorated_func(result, func)
++            return result
++
++        decorated_funcs[file].append(
++            {"func": wrapper, "tag": tag, "parallel": parallel}
++        )
++        return wrapper
++
++    return decorator
++
++
++def get_registered_cmd_funcs(
++    module: ModuleType, parallel: bool = True
++):
++    if not isinstance(module, ModuleType) or not hasattr(module, "__file__"):
++        raise RuntimeError(
++            f"module {module.__name__} has no attr __file__, maybe it is a built-in module"
++        )
++    caller_file = module.__file__
++
++    registered_funcs = decorated_funcs.get(caller_file, [])
++
++    func_list = []
++    for func_info in registered_funcs:
++        if func_info["parallel"] == parallel:
++            func_list.append({"func": func_info["func"], "tag": func_info["tag"]})
++    return func_list
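++
++
++# Usage sketch: register a parsed collector (command and tag are illustrative):
++#   @cmd_pipeline(cmd="free -m", tag="memory")
++#   def parse_free(output: str):
++#       return output.splitlines()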
+diff --git a/copilot-tune/src/utils/thread_pool.py b/copilot-tune/src/utils/thread_pool.py
+new file mode 100644
+index 0000000..5da042d
+--- /dev/null
++++ b/copilot-tune/src/utils/thread_pool.py
+@@ -0,0 +1,236 @@
++import uuid
++import traceback
++import threading
++import concurrent.futures
++from typing import Any, Callable, Dict, Tuple, List, Union, Iterable
++
++from src.utils.common import ExecuteResult
++
++
++class TaskResult:
++    def __init__(
++        self,
++        uuid: str,
++        func_name: str = "",
++        result: Any = None,
++        status_code: int = 0,
++        tag: str = "default_tag",
++    ):
++        self.uuid = uuid
++        self.func_name = func_name
++        self.result = result
++        self.status_code = status_code
++        self.tag = tag
++
++    def as_dict(self):
++        return {
++            "uuid": self.uuid,
++            "func_name": self.func_name,
++            "result": self.result,
++            "status_code": self.status_code,
++            "tag": self.tag,
++        }
++
++    def __repr__(self):
++        return str(self.as_dict())
++
++
++# ThreadPoolManager: used to speed up the performance-collector commands by
++# running them in a thread pool.
++class ThreadPoolManager:
++    def __init__(self, max_workers: int = 5):
++        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
++        self.tasks: Dict[str, concurrent.futures.Future] = {}
++        self.all_results: List[TaskResult] = []
++        self.pending: list[tuple[str, Callable, tuple, dict]] = []
++        self.tag_map: dict = {}
++        self.task_meta: Dict[str, str] = {}
++
++    # Add a task to be run; every task is assigned a task id.
++    # Callers can check whether the task has finished via is_done().
++    def add_task(self, func: Callable, *args, **kwargs) -> str:
++        task_id = str(uuid.uuid4())
++        self.pending.append((task_id, func, args, kwargs))
++        self.task_meta[task_id] = func.__name__
++        self.tag_map[task_id] = kwargs.pop("tag", "default_tag")
++        return task_id
++
++    def add_batch(
++        self, tasks: Iterable[Union[Callable, Tuple[Callable, Tuple, Dict]]]
++    ) -> List[str]:
++        """
++        Batch-submit tasks, for example:
++
++        def hello():
++            return "hello"
++
++        def add(x, y, z=0):
++            return x + y + z
++
++        tasks = [
++            hello,
++            (add, (1, 2), {}),
++            (add, (3, 4), {"z": 5})
++        ]
++        """
++        uuids = []
++        for task in tasks:
++            if callable(task):
++                task_id = self.add_task(task, tag="default_tag")
++            elif isinstance(task, tuple):
++                # (func, args, kwargs)
++                func = task[0]
++                args = task[1] if len(task) > 1 else ()
++                kwargs = task[2] if len(task) > 2 else {}
++                task_id = self.add_task(func, *args, **kwargs)
++            else:
++                raise ValueError(f"Unsupported task format: {task}")
++            uuids.append(task_id)
++        return uuids
++
++    def add_multi_batch(self, *args) -> List[str]:
++        uuids_all = []
++        for task in args:
++            if isinstance(task, list):
++                uuids_batch = self.add_batch(task)
++            else:
++                raise ValueError(f"Unsupported task format: {task}")
++            uuids_all.extend(uuids_batch)
++        return uuids_all
++
++    def run_all_tasks(self) -> None:
++        for task_id, func, args, kwargs in self.pending:
++            future = self.executor.submit(func, *args, **kwargs)
++            self.tasks[task_id] = future
++        self.pending.clear()
++
++    # Get a task result by id.
++    def get_result(self, task_id: str) -> Any:
++        future = self.tasks.get(task_id)
++        if not future:
++            raise ValueError(f"No such task: {task_id}")
++        return future.result()
++
++    # Check whether a task has finished.
++    def is_done(self, task_id: str) -> bool:
++        future = self.tasks.get(task_id)
++        if not future:
++            raise ValueError(f"No such task: {task_id}")
++        return future.done()
++
++    # Wait until all tasks are done.
++    def wait_all(self) -> None:
++        concurrent.futures.wait(self.tasks.values())
++        self.all_results.clear()
++        for task_id, future in self.tasks.items():
++            func_name = self.task_meta.get(task_id, "unknown")
++            try:
++                result = future.result()
++                status_code = 0
++            except Exception:
++                result = ExecuteResult(
++                    status_code=-1, output="", err_msg=traceback.format_exc()
++                )
++                status_code = -1
++            self.all_results.append(
++                TaskResult(
++                    task_id, func_name, result, status_code, self.tag_map[task_id]
++                )
++            )
++
++    def get_all_results(self) -> List[TaskResult]:
++        self.wait_all()
++        self.tasks.clear()
++        return self.all_results
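++
++
++# Usage sketch for the module-level managers defined at the end of this file:
++#   ids = thread_pool_manager.add_batch([hello, (add, (1, 2), {})])
++#   thread_pool_manager.run_all_tasks()
++#   results = thread_pool_manager.get_all_results()  # list[TaskResult]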
++
++
++class SerialTaskManager:
++    def __init__(self):
++        self.tasks: List[Tuple[str, Callable, Tuple, Dict]] = []
++        self.all_results: List[TaskResult] = []
++        self.tag_map: dict = {}
++        self.task_meta: Dict[str, str] = {}
++
++    def add_task(self, func: Callable, *args, **kwargs) -> str:
++        task_id = str(uuid.uuid4())
++        self.task_meta[task_id] = func.__name__
++        # pop "tag" so it is not forwarded to func, mirroring ThreadPoolManager
++        self.tag_map[task_id] = kwargs.pop("tag", "default_tag")
++        self.tasks.append((task_id, func, args, kwargs))
++        return task_id
++
++    def add_batch(
++        self, tasks: Iterable[Union[Callable, Tuple[Callable, Tuple, Dict]]]
++    ) -> List[str]:
++        uuids = []
++        for task in tasks:
++            if callable(task):
++                task_id = self.add_task(task, tag="default_tag")
++            elif isinstance(task, tuple):
++                # (func, args, kwargs)
++                func = task[0]
++                args = task[1] if len(task) > 1 else ()
++                kwargs = task[2] if len(task) > 2 else {}
++                task_id = self.add_task(func, *args, **kwargs)
++            else:
++                raise ValueError(f"Unsupported task format: {task}")
++            uuids.append(task_id)
++        return uuids
++
++    def add_multi_batch(self, *args) -> List[str]:
++        uuids_all = []
++        for task in args:
++            if isinstance(task, list):
++                uuids_batch = self.add_batch(task)
++            else:
++                raise ValueError(f"Unsupported task format: {task}")
++            uuids_all.extend(uuids_batch)
++        return uuids_all
++
++    def run_task_with_timeout(
++        self, task_id: str, func: Callable, args: Tuple, kwargs: Dict
++    ) -> TaskResult:
++        func_name = self.task_meta[task_id]
++        tag = self.tag_map[task_id]
++        result = None
++        status_code = 0
++
++        def target():
++            nonlocal result, status_code
++            try:
++                result = func(*args, **kwargs)
++                status_code = 0
++            except Exception:
++                result = ExecuteResult(
++                    status_code=-1, output="", err_msg=traceback.format_exc()
++                )
++                status_code = -1
++
++        # daemon=True so a hung task cannot keep the interpreter alive; note
++        # the worker thread is not killed on timeout and may keep running.
++        thread = threading.Thread(target=target, daemon=True)
++        thread.start()
++        thread.join(timeout=30)  # per-task timeout of 30 seconds
++        if thread.is_alive():
++            result = ExecuteResult(
++                status_code=-1, output="", err_msg="Task timed out after 30 seconds"
++            )
++            status_code = -1
++        return TaskResult(task_id, func_name, result, status_code, tag)
++
++    def run_all_tasks(self) -> None:
++        self.all_results.clear()
++        for task_id, func, args, kwargs in self.tasks:
++            task_result = self.run_task_with_timeout(task_id, func, args, kwargs)
++            self.all_results.append(task_result)
++
++    def get_all_results(self) -> List[TaskResult]:
++        self.tasks.clear()
++        return self.all_results
++
++
++thread_pool_manager = ThreadPoolManager(max_workers=8)
++serial_task_manager = SerialTaskManager()
+-- 
+2.43.0