From 406f685650764e91c324384793483793cf75c509 Mon Sep 17 00:00:00 2001
From: petermouse666 <708975811@qq.com>
Date: Wed, 10 Sep 2025 08:52:44 +0000
Subject: [PATCH 1/8] update ci-bot for auto generating translation issue

---
 .../new_create_translation_issue.py           |   72 +-
 .../new_create_translation_issue.yaml         |   27 +
 ci/tools/translation/translation_agent.py     | 1279 +++++++++++++++++
 3 files changed, 1373 insertions(+), 5 deletions(-)
 mode change 100644 => 100755 ci/tools/translation/new_create_translation_issue.py
 mode change 100644 => 100755 ci/tools/translation/new_create_translation_issue.yaml
 create mode 100755 ci/tools/translation/translation_agent.py

diff --git a/ci/tools/translation/new_create_translation_issue.py b/ci/tools/translation/new_create_translation_issue.py
old mode 100644
new mode 100755
index 5ca3cc0d2..0a61c7136
--- a/ci/tools/translation/new_create_translation_issue.py
+++ b/ci/tools/translation/new_create_translation_issue.py
@@ -6,6 +6,7 @@ import sys
 from dataclasses import dataclass, field
 from difflib import SequenceMatcher
 from typing import TypeVar, Generic
+from translation_agent import get_agent_summary
 import requests
 import yaml
@@ -39,15 +40,27 @@ class Org:
         self.issue_triggers = tmp_issue_triggers


+@dataclass
+class TranslationAgentConfig:
+    backend: dict = field(default_factory=dict)
+    model: dict = field(default_factory=dict)
+    processing: dict = field(default_factory=dict)
+    logging: dict = field(default_factory=dict)
+
+
 @dataclass
 class Config:
     orgs: list[dict | Org]
+    translation_agent: dict | TranslationAgentConfig = field(default_factory=dict)
 
     def __post_init__(self):
         tmp_orgs: list[Org] = []
         for item in self.orgs:
             tmp_orgs.append(Org(**item))
         self.orgs = tmp_orgs
+
+        if isinstance(self.translation_agent, dict) and self.translation_agent:
+            self.translation_agent = TranslationAgentConfig(**self.translation_agent)
 
 
 @dataclass
@@ -231,6 +244,8 @@ class Args:
     pr_owner: str
     pr_repo: str
     pr_number: int
+    siliconflow_api_key: str = ""
+    siliconflow_api_base: str = "https://api.siliconflow.cn/v1"
 
     def validate(self):
         valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number
@@ -249,14 +264,15 @@ def load_config_yaml(yaml_path):
 
 
 def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str,
-                                             pr_number: int):
+                                             pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str):
     pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number)
     for org_item in conf.orgs:
         issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number)
         if org_item.org_name != pr_owner:
             continue
-        if org_item.auto_create_issue:
-            cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude)
+        # Legacy punctuation-only-change check, now deprecated
+        # if org_item.auto_create_issue:
+        #     cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude)
         file_count = 0
         diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number)
         if diff_content is None:
@@ -300,18 +316,60 @@ def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_
             need_create_issue_titles.append(need_create_issue[issue_item][1])
             need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0]
         if need_create_issue_titles:
+
+            need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_titles)
+
             if not need_create_issue_list:
                 feedback_comment = "issue has already created, please go to check issue: {}".format(
                     existed_issue_list)
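+                # every requested title already has an open issue; surface them on the PR instead of filing duplicates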
                 logger.info("Warning: " + feedback_comment)
                 cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment)
+            # The diff is identical for every issue title, so generate the AI summary
+            # once per PR and reuse it for each issue created below.
+            issue_summary = get_agent_summary(diff_content, siliconflow_api_key, siliconflow_api_base) if need_create_issue_list else None
             for need_create_issue_item in need_create_issue_list:
+                issue_body = ""
+                if issue_summary and not issue_summary.error:
+                    issue_body += "## 📊 变更统计\n\n"
+                    issue_body += f"- **总文件数**: {issue_summary.total_files}\n"
+                    issue_body += f"- **成功处理文件数**: {issue_summary.processed_files}\n"
+                    if issue_summary.total_files != issue_summary.processed_files:
+                        # some files were skipped by the bot; flag them for manual review
+                        issue_body += f"- **未处理文件数**: {issue_summary.total_files - issue_summary.processed_files}\n"
+                        issue_body += "- **提醒:机器人未能及时自动生成所有改动的摘要,请注意人工审查!**\n"
+                    if issue_summary.total_summary:
+                        total = issue_summary.total_summary
+                        issue_body += f"- **总改动行数**: {total.total_lines_changed}\n"
+                        issue_body += f"- **改动类型**: {', '.join(total.change_type_list)}\n\n"
+                        issue_body += "## 🔍 整体变更摘要\n\n"
+                        issue_body += f"{total.overall_summary}\n\n"
+                        issue_body += "## ⚠️ 整体潜在影响\n\n"
+                        issue_body += f"{total.overall_potential_impact}\n\n"
+                    if issue_summary.file_summaries:
+                        issue_body += "## 📝 单文件变更详情\n\n"
+                        for summary in issue_summary.file_summaries:
+                            issue_body += f"### 📁 {summary.file_path}\n\n"
+                            issue_body += f"- **改动类型**: {summary.change_type}\n"
+                            issue_body += f"- **新增行数**: {summary.lines_added}\n"
+                            issue_body += f"- **删除行数**: {summary.lines_deleted}\n"
+                            issue_body += f"- **潜在影响**: {summary.potential_impact}\n"
+                            issue_body += f"- **详细摘要**: {summary.summary}\n\n"
+                            issue_body += "---\n\n"
+                else:
+                    issue_body += "## ⚠️ 翻译变更检测\n\n"
+                    issue_body += "检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n"
+                    issue_body += f"**变更文件数量**: {len(diff_files)}\n"
+                    issue_body += f"**相关PR**: {pr__html_url}\n\n"
+
+                issue_body += "## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,仅供参考,请以实际更改为准。\n\n"
+                issue_body += "## 🔗 相关PR链接\n\n"
+                issue_body += f"- {pr__html_url}\n"
+
                 cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item,
                                  need_create_issue_template[need_create_issue_item],
-                                 "### Related PR link \n - {}".format(pr__html_url))
+                                 issue_body)
+
 
 
 def main():
@@ -320,6 +378,8 @@ def main():
     parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner')
     parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo')
     parser.add_argument('--pr_number', type=str, required=True, help='the PR number')
+    parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow')
+    parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow')
     args = Args()
     parser.parse_args(args=sys.argv[1:], namespace=args)
     args.validate()
@@ -333,7 +393,9 @@ def main():
     pr_owner = args.pr_owner
     pr_repo = args.pr_repo
     pr_number = args.pr_number
-    create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number)
+    siliconflow_api_key = args.siliconflow_api_key
+    siliconflow_api_base = args.siliconflow_api_base
+    create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base)
 
 
 if __name__ == '__main__':
diff --git a/ci/tools/translation/new_create_translation_issue.yaml b/ci/tools/translation/new_create_translation_issue.yaml
old mode 100644
new mode 100755
index a58ebcc6e..bc48ab7a2
--- a/ci/tools/translation/new_create_translation_issue.yaml
+++ b/ci/tools/translation/new_create_translation_issue.yaml
@@ -1,3 +1,30 @@
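+# The translation_agent block below is consumed by translation_agent.py via
+# load_config(); each key maps to a module constant there (backend.type ->
+# BACKEND_TYPE, model.name -> MODEL_NAME, processing.max_workers ->
+# PROCESSING_MAX_WORKERS, logging.level -> LOGGING_LEVEL).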
+# Translation Agent Configuration +translation_agent: + # Backend Configuration + backend: + type: "siliconflow" # Options: "ollama" or "siliconflow" + # siliconflow配置现在通过命令行参数传入 + ollama: + base_url: "http://localhost:11434" + + # Model Configuration + model: + name: "Qwen/Qwen3-32B" # Options: "llama3" "Qwen/Qwen3-8B" "THUDM/GLM-4-32B-0414" or others + temperature: 0.1 + max_retry: 5 # For siliconflow backend + max_retry_ollama: 1 # For ollama backend + + # Processing Configuration + processing: + max_workers: 8 # Number of parallel workers for file processing + single_file_timeout: 180 # Timeout for single file summary generation (seconds) + total_summary_timeout: 300 # Timeout for total summary generation (seconds) + + # Logging Configuration + logging: + level: "INFO" + +# Issue Creation Configuration orgs: - org_name: openeuler issue_of_owner: openeuler diff --git a/ci/tools/translation/translation_agent.py b/ci/tools/translation/translation_agent.py new file mode 100755 index 000000000..258826eb5 --- /dev/null +++ b/ci/tools/translation/translation_agent.py @@ -0,0 +1,1279 @@ +import json +import re +import logging +import urllib.parse +from typing import List, Dict, Any, Optional, Tuple, Literal +from dataclasses import dataclass +from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError +from pathlib import Path +import tiktoken +import sys +import time +# LangChain imports +from langchain_core.prompts import ChatPromptTemplate, PromptTemplate +from langchain_core.runnables import RunnableLambda, RunnablePassthrough +from pydantic import BaseModel, Field, SecretStr +from langchain_community.llms import Ollama +from langchain_ollama import ChatOllama +from langchain.chains import TransformChain, SequentialChain +from langchain_core.output_parsers import JsonOutputParser +from langchain_openai import ChatOpenAI +import yaml + +# ==================== 配置加载 ==================== + +def load_config(config_file="new_create_translation_issue.yaml"): + """从YAML文件加载配置""" + try: + with open(config_file, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + return config.get('translation_agent', {}) + except FileNotFoundError: + print(f"配置文件 {config_file} 不存在") + raise + except yaml.YAMLError as e: + print(f"解析配置文件时发生错误: {e}") + raise + +# 加载配置 +_config = load_config() + +# ==================== 配置常量 ==================== + +BACKEND_TYPE = _config.get('backend', {}).get('type', 'siliconflow') +OLLAMA_BASE_URL = _config.get('backend', {}).get('ollama', {}).get('base_url', 'http://localhost:11434') +MODEL_NAME = _config.get('model', {}).get('name', 'Qwen/Qwen3-8B') +MODEL_TEMPERATURE = _config.get('model', {}).get('temperature', 0.1) +MODEL_MAX_RETRY = _config.get('model', {}).get('max_retry', 5) +MODEL_MAX_RETRY_OLLAMA = _config.get('model', {}).get('max_retry_ollama', 1) +PROCESSING_MAX_WORKERS = _config.get('processing', {}).get('max_workers', 8) +SINGLE_FILE_TIMEOUT = _config.get('processing', {}).get('single_file_timeout', 180) +TOTAL_SUMMARY_TIMEOUT = _config.get('processing', {}).get('total_summary_timeout', 300) +LOGGING_LEVEL = _config.get('logging', {}).get('level', 'INFO') +SILICONFLOW_API_KEY = '' +SILICONFLOW_API_BASE ='' + +# 配置日志 +logging.basicConfig(level=getattr(logging, LOGGING_LEVEL.upper())) +logger = logging.getLogger(__name__) + +# ==================== 数据模型定义 ==================== + +class SingleFileSummary(BaseModel): + """单个文件摘要的结构化输出""" + file_path: str = Field(description="文件路径", default="") + change_type: 
Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") + potential_impact: str = Field(description="改动对其他文件潜在的影响") + summary: str = Field(description="改动的详细摘要") + lines_added: int = Field(description="新增行数", default=0) + lines_deleted: int = Field(description="删除行数", default=0) + +class FileChangeInfo(BaseModel): + """文件改动信息""" + file_path: str = Field(description="文件路径") + change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") + lines_changed: int = Field(description="改动行数") + +class TotalSummary(BaseModel): + """总摘要的结构化输出""" + total_files_changed: int = Field(description="总共修改的文件数量", default=0) + total_lines_changed: int = Field(description="总共修改的行数", default=0) + overall_potential_impact: str = Field(description="整体改动对其他文件潜在的影响") + overall_summary: str = Field(description="整体改动的详细摘要") + change_type_list: List[str] = Field(description="所有文件包含的改动种类列表", default=[]) + file_changes: List[FileChangeInfo] = Field(description="每个修改文件的详细信息列表", default=[]) + +@dataclass +class DiffFileInfo: + """单个文件的diff信息""" + file_path: str + diff_content: str + lines_added: int + lines_deleted: int + +@dataclass +class ProcessingResult: + """处理结果""" + file_summaries: List[SingleFileSummary] + total_summary: Optional[TotalSummary] + processed_files: int + total_files: int + error: Optional[str] = None + +# ==================== Token 统计工具 ==================== + +class TokenCounter: + def __init__(self, model_name=MODEL_NAME): + self.model_name = model_name + self.prompt_tokens = 0 + self.completion_tokens = 0 + self.total_tokens = 0 + self.tokenizer = None + self._init_tokenizer() + + def _init_tokenizer(self): + """初始化tokenizer""" + try: + self.tokenizer = tiktoken.encoding_for_model(self.model_name) + except Exception: + try: + self.tokenizer = tiktoken.get_encoding("cl100k_base") + except Exception: + logger.warning("无法初始化tokenizer,将不会计算token数量") + + def _encode(self, text: str) -> List[int]: + """编码文本""" + if not isinstance(text, str): + return [] + if self.tokenizer is None: + # 如果没有tokenizer,使用简单的估算方法 + return [0] * (len(text) // 4) + try: + return self.tokenizer.encode(text) + except Exception as e: + logger.warning(f"编码文本时发生错误: {e}") + # 如果编码失败,使用简单的估算方法 + return [0] * (len(text) // 4) + + def _count_tokens(self, text: str) -> int: + """计算文本的token数量""" + return len(self._encode(text)) + + def count_prompt(self, prompt: str) -> int: + """计算prompt的token数量""" + tokens = self._count_tokens(prompt) + self.prompt_tokens += tokens + self.total_tokens += tokens + return tokens + + def count_completion(self, completion: str) -> int: + """计算completion的token数量""" + tokens = self._count_tokens(completion) + self.completion_tokens += tokens + self.total_tokens += tokens + return tokens + + def get_stats(self): + return { + "prompt_tokens": self.prompt_tokens, + "completion_tokens": self.completion_tokens, + "total_tokens": self.total_tokens + } + +# ==================== 工具函数 ==================== + +class DiffParser: + """Git Diff 解析器""" + + @staticmethod + def parse_git_diff(diff_content: str) -> List[DiffFileInfo]: + """ + 解析git diff内容,提取每个文件的改动信息 + + Args: + diff_content: git diff的原始内容 + + Returns: + 包含文件路径和对应diff内容的列表 + """ + + files = [] + current_file = None + current_diff = [] + + lines = diff_content.strip().split('\n') + + for line in lines: + # 匹配文件路径行 + if line.startswith('diff --git'): + # 保存前一个文件的信息 + if current_file and current_diff: + diff_info = DiffParser._create_diff_file_info(current_file, current_diff) + 
if diff_info: + files.append(diff_info) + + # 提取文件路径 - 改进的解析逻辑 + current_file = DiffParser._extract_file_path(line) + if current_file: + current_diff = [line] + else: + current_diff = [] + elif current_file: + current_diff.append(line) + + # 添加最后一个文件 + if current_file and current_diff: + diff_info = DiffParser._create_diff_file_info(current_file, current_diff) + if diff_info: + files.append(diff_info) + + return files + + @staticmethod + def _extract_file_path(diff_line: str) -> Optional[str]: + """ + 从git diff行中提取文件路径,支持包含汉字的文件名 + + Args: + diff_line: git diff的文件头行,格式如 "diff --git a/path/to/file b/path/to/file" + + Returns: + 提取出的文件路径,如果解析失败则返回None + """ + try: + # 方法1: 处理引号包围的路径(Git对特殊字符的处理) + # 格式: diff --git "a/path/to/file" "b/path/to/file" + quoted_pattern = r'diff --git "a/(.+?)" "b/(.+?)"' + quoted_match = re.match(quoted_pattern, diff_line) + + if quoted_match: + file_path_a = quoted_match.group(1) + file_path_b = quoted_match.group(2) + # 通常a和b路径相同,使用a路径(旧文件路径) + file_path = file_path_a + else: + # 方法2: 使用正则表达式匹配标准的git diff格式 + # 格式: diff --git a/path/to/file b/path/to/file + pattern = r'diff --git a/(.+?) b/(.+?)(?:\s|$)' + match = re.match(pattern, diff_line) + + if match: + file_path_a = match.group(1) + file_path_b = match.group(2) + # 通常a和b路径相同,使用a路径(旧文件路径) + file_path = file_path_a + else: + # 方法3: 如果正则匹配失败,尝试更简单的解析 + # 处理可能包含空格和特殊字符的文件名 + if ' a/' in diff_line and ' b/' in diff_line: + # 找到 a/ 和 b/ 的位置 + a_pos = diff_line.find(' a/') + b_pos = diff_line.find(' b/') + + if a_pos != -1 and b_pos != -1 and a_pos < b_pos: + # 提取a/和b/之间的路径 + a_start = a_pos + 3 # 跳过 ' a/' + file_path = diff_line[a_start:b_pos] + else: + return None + else: + # 方法4: 最后的备选方案,简单的字符串分割 + parts = diff_line.split() + if len(parts) >= 3: + a_path = parts[2] + if a_path.startswith('a/'): + file_path = a_path[2:] # 移除'a/'前缀 + else: + return None + else: + return None + + # 处理文件名编码 + return DiffParser._decode_file_path(file_path) + + except Exception as e: + logger.warning(f"解析文件路径时发生错误: {e}, diff行: {diff_line}") + return None + + @staticmethod + def _decode_file_path(file_path: str) -> str: + """ + 解码文件路径,处理各种编码情况 + + Args: + file_path: 原始文件路径 + + Returns: + 解码后的文件路径 + """ + try: + # 首先尝试URL解码,处理Git编码的文件名 + decoded_path = urllib.parse.unquote(file_path, encoding='utf-8') + + # 处理Git对特殊字符的引号包装 + if decoded_path.startswith('"') and decoded_path.endswith('"'): + decoded_path = decoded_path[1:-1] + # Git使用反斜杠转义,需要处理转义序列 + decoded_path = decoded_path.replace('\\"', '"') + decoded_path = decoded_path.replace('\\\\', '\\') + + # 无论是否有引号包装,都尝试处理八进制编码 + # 检查是否包含八进制转义序列 + if '\\' in decoded_path and re.search(r'\\[0-7]{3}', decoded_path): + decoded_path = DiffParser._decode_octal_sequences(decoded_path) + + return decoded_path + + except Exception as e: + logger.warning(f"解码文件路径时发生错误: {e}, 原始路径: {file_path}") + return file_path + + @staticmethod + def _decode_octal_sequences(text: str) -> str: + """ + 解码文本中的八进制转义序列 + + Args: + text: 包含八进制转义序列的文本 + + Returns: + 解码后的文本 + """ + try: + # 查找八进制转义序列模式:\xxx + pattern = r'\\([0-7]{3})' + + # 找到所有八进制序列 + matches = list(re.finditer(pattern, text)) + if not matches: + return text + + # 收集所有字节值 + result = "" + last_end = 0 + bytes_buffer = [] + + for i, match in enumerate(matches): + # 添加匹配前的文本 + if match.start() > last_end: + # 如果有缓冲的字节,先处理它们 + if bytes_buffer: + try: + decoded_bytes = bytes(bytes_buffer).decode('utf-8') + result += decoded_bytes + bytes_buffer = [] + except UnicodeDecodeError: + # 如果解码失败,保持原始形式 + for byte_val in bytes_buffer: + result += 
f"\\{oct(byte_val)[2:].zfill(3)}" + bytes_buffer = [] + + result += text[last_end:match.start()] + + # 处理当前八进制序列 + octal_str = match.group(1) + try: + byte_value = int(octal_str, 8) + bytes_buffer.append(byte_value) + except ValueError: + # 如果转换失败,添加原始字符串 + if bytes_buffer: + try: + decoded_bytes = bytes(bytes_buffer).decode('utf-8') + result += decoded_bytes + bytes_buffer = [] + except UnicodeDecodeError: + for byte_val in bytes_buffer: + result += f"\\{oct(byte_val)[2:].zfill(3)}" + bytes_buffer = [] + result += match.group(0) + + last_end = match.end() + + # 检查是否是最后一个匹配或下一个匹配不连续 + is_last = (i == len(matches) - 1) + is_next_non_consecutive = (not is_last and + matches[i + 1].start() != match.end()) + + if is_last or is_next_non_consecutive: + # 处理缓冲的字节 + if bytes_buffer: + try: + decoded_bytes = bytes(bytes_buffer).decode('utf-8') + result += decoded_bytes + except UnicodeDecodeError: + # 如果解码失败,保持原始形式 + for byte_val in bytes_buffer: + result += f"\\{oct(byte_val)[2:].zfill(3)}" + bytes_buffer = [] + + # 添加剩余的文本 + if last_end < len(text): + result += text[last_end:] + + return result + + except Exception as e: + logger.warning(f"解码八进制序列时发生错误: {e}, 原始文本: {text}") + return text + + @staticmethod + def _create_diff_file_info(file_path: str, diff_lines: List[str]) -> Optional[DiffFileInfo]: + """创建DiffFileInfo对象""" + diff_content = '\n'.join(diff_lines) + lines_added, lines_deleted = DiffParser._count_lines_changed(diff_content) + + return DiffFileInfo( + file_path=file_path, + diff_content=diff_content, + lines_added=lines_added, + lines_deleted=lines_deleted + ) + + @staticmethod + def _count_lines_changed(diff_content: str) -> Tuple[int, int]: + """统计git diff中改动的行数""" + lines_added, lines_deleted = 0, 0 + lines = diff_content.strip().split('\n') + + for line in lines: + # 统计新增行(以+开头,但不是+++) + if line.startswith('+') and not line.startswith('+++'): + lines_added += 1 + # 统计删除行(以-开头,但不是---) + elif line.startswith('-') and not line.startswith('---'): + lines_deleted += 1 + + return lines_added, lines_deleted + +# ==================== LangChain 组件 ==================== + +class LLMFactory: + """LLM工厂类""" + + @staticmethod + def create_chat_llm(model_name: str = None, base_url: str = None): + """创建LLM实例""" + if model_name is None: + model_name = MODEL_NAME + if base_url is None: + base_url = OLLAMA_BASE_URL + + if BACKEND_TYPE == "ollama": + return ChatOllama( + model=model_name, + base_url=base_url, + temperature=MODEL_TEMPERATURE + ) + elif BACKEND_TYPE == "siliconflow": + return ChatOpenAI( + model=model_name, + api_key=SecretStr(SILICONFLOW_API_KEY), + base_url=SILICONFLOW_API_BASE, + temperature=MODEL_TEMPERATURE + ) + else: + raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}") + + @staticmethod + def create_llm(model_name: str = None, base_url: str = None): + """创建LLM实例""" + if model_name is None: + model_name = MODEL_NAME + if base_url is None: + base_url = OLLAMA_BASE_URL + + if BACKEND_TYPE == "ollama": + return Ollama( + model=model_name, + base_url=base_url, + temperature=MODEL_TEMPERATURE + ) + elif BACKEND_TYPE == "siliconflow": + return ChatOpenAI( + model=model_name, + api_key=SecretStr(SILICONFLOW_API_KEY), + base_url=SILICONFLOW_API_BASE, + temperature=MODEL_TEMPERATURE + ) + else: + raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}") + +class PromptTemplates: + """提示模板集合""" + + @staticmethod + def get_single_file_prompt() -> ChatPromptTemplate: + """获取单文件分析提示模板""" + return ChatPromptTemplate.from_messages([ + ("system", f""" +你是一个专业的Git维护专家,擅长总结社区文档的改动,请分析以下git diff中单个文件的改动,并生成结构化的摘要。 + 
+请仔细分析这个文件的改动,并按照以下要求生成摘要: + +**务必注意:当你对单个文件的所有变更内容从头到尾进行过完整的分析之后,再生成你最终的结论!不要仅根据其中几行的增删改就给出你的结论!** + +1. 改动类型判断(必须选择以下四种之一,请严格按照示例进行判断): + + - "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - "仅涉及标点符号的修改":仅修改了标点符号的增减、删除、变动,几乎不影响理解 + - "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + - "涉及到中英文文本内容的修改":修改了文档内容、命令或代码注释、字符串等文本,需要对内容进行翻译或调整以使得所有语种的人都可以理解 + +**其中,你需要重点对后三种类型的修改进行区分。越靠后,修改类型判定的优先级越高。** +如果修改的内容仅仅为新增了二进制文件、新增了依赖库等其他内容,绝大部分情况都可以归类为"涉及到其他内容的修改"。 +如果修改的内容不涉及中文或英文字符且不涉及代码改动,绝大部分情况都可以归类为"仅涉及标点符号的修改",但一旦存在除了标点符号或文档格式以外的改动,则优先归为其他类别。 +如果修改的内容涉及代码逻辑、函数定义、配置结构、脚本实现等可能产生现实影响的变更,或者对环境部署命令行、内容配置进行了更改或调整,但不需要对内容进行翻译或调整以使得所有语种的人都可以理解,则归类为"涉及到代码内容的修改"。 +如果修改的内容涉及中文或英文字符,且需要对内容进行翻译或调整以使得所有语种的人都可以理解,可以归类为"涉及到中英文文本内容的修改"。 +一个区分"涉及到代码内容的修改"和"涉及到中英文文本内容的修改"的标准是:如果当前的改动属于某一语言,如果使用者不理解该语言,则必须要对改动进行翻译才能理解,则归类为"涉及到中英文文本内容的修改",否则归类为"涉及到代码内容的修改"。 + +下面我将提供几个判断示例供你参考: + +示例1 - 仅涉及标点符号的修改: +```diff +- 这是一个测试文档,用于演示功能。 ++ 这是一个测试文档,用于演示功能! +``` +分析:只变更了逗号为中文逗号,句号为感叹号,属于"仅涉及标点符号的修改" +或者文件中: +```diff +- 这个文档的功能有进一步补充的空间。 ++ 这个文档的功能有进一步补充的空间! +``` +分析:只涉及中文句号和感叹号的增删改,不涉及中文字符和英文字符的改动,且不涉及代码改动,属于"仅涉及标点符号的修改" + +示例2 - 涉及到代码内容的修改: +```diff +- function getUserInfo() ++ function getUserProfile() +``` +或者在文档的代码块中: +```diff +- ```python +- def hello(): +- print("hello") +- ``` ++ ```python ++ def greeting(): ++ print("hello world") ++ ``` +``` +或者在文档的命令行代码块中 +```diff +- pwd +- cat /etc/profile ++ sudo apt update ++ whoami ++ echo "hello" +``` +分析:修改了函数名、逻辑或文档文本中的代码块等,但是不涉及需要翻译的内容,属于"涉及到代码内容的修改" + +示例3 - 涉及到中英文文本内容的修改: +```diff +- // 这是一个注释说明 ++ // 这是一个更详细的注释说明 +``` +或者JSON中: +```diff +- "description": "用户管理模块" ++ "description": "用户账户管理模块" +``` +分析:修改了注释或文档文本内容,影响用户的阅读理解,需要对内容进行翻译或调整以使得所有语种的人都可以理解,属于"涉及到中英文文本内容的修改" + +示例4 - 涉及到其他内容的修改: +```diff ++ Binary file image.png added +``` +或者: +```diff ++ "dependencies": ++ "new-package": "^1.0.0" ++ +``` +分析:新增了二进制文件或依赖包等,属于"涉及到其他内容的修改" + +2. 潜在影响分析: + - 分析这个文件的改动可能对其他文件或整体系统造成的影响 + - 考虑依赖关系、接口变化、数据流等 + - 如果是配置文件的修改,考虑对系统配置的影响 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 详细摘要: + - 提炼出摘要改动文件所属的板块,并解释板块作用 + - 结合文件名和改动细节,用详细的语言描述具体的改动内容,要求准确全面,且改动内容要做到具体 + - 突出重要的改动点和影响范围,包括修改内容主要针对的对象、文档的分类等 + - 结合文件名、改动类型、潜在影响分析,对摘要做进一步补充 + +4. 输出格式: + - 请用中文生成摘要 + - 要求改动类型、潜在影响、改动内容总结都包含在摘要中,不能存在空字段 + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + + """), + ("human", """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} + + """) + ]) + + @staticmethod + def get_total_summary_prompt() -> ChatPromptTemplate: + """获取总摘要生成提示模板""" + return ChatPromptTemplate.from_messages([ + ("system", """ +你是一个专业的Git维护专家,擅长总结社区文档的改动,请基于以下各个文件的改动摘要,生成整个git diff的总摘要。 + +请分析所有文件的改动,并生成一个总摘要,要求: + +1. 整体改动类型统计: + - 统计所有文件涉及到的改动类型,取并集 + - 四种改动类型说明: + * "仅涉及标点符号的修改":只修改了标点符号的增减、删除、变动 + * "涉及到中英文文本内容的修改":修改了文档内容、注释等文本,但未涉及代码逻辑 + * "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + * "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - 将所有出现的改动类型都列出,不做优先级选择 + +统计示例: + +示例1 - 单一类型: +文件A:仅涉及标点符号的修改 +文件B:仅涉及标点符号的修改 +→ 整体改动类型:["仅涉及标点符号的修改"] + +示例2 - 多种类型: +文件A:仅涉及标点符号的修改 +文件B:涉及到中英文文本内容的修改 +→ 整体改动类型:["仅涉及标点符号的修改", "涉及到中英文文本内容的修改"] + +示例3 - 复杂混合: +文件A:涉及到中英文文本内容的修改 +文件B:涉及到代码内容的修改 +文件C:涉及到其他内容的修改 +→ 整体改动类型:["涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] + +2. 整体潜在影响分析: + - 逐个总结所有文件的改动内容,并进行详细的列举,尽量涵盖所有修改内容 + - 综合分析所有文件改动对系统的整体影响 + - 考虑文件间的依赖关系和系统架构影响 + - 评估改动的风险等级和影响范围 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 整体摘要详细列举: + - 提炼出所有摘要改动文件所属的板块,并解释板块作用 + - 用详细的语言分条概括每个摘要文件的核心内容,需要具体到文件,这一部分要占到最大的篇幅,不要遗漏任何摘要文件的内容 + - 突出重要的改动点,包括修改内容主要针对的对象、文档的分类等 + - 注意:整体摘要需要总结所有文件的内容;整体摘要需要尽可能详细 + +4. 
输出格式: + - 请用中文生成摘要,整体摘要内容字段务必全面详细 + - 要求整体潜在影响、整体摘要都包含在摘要中,不能存在空字段 + - 整体摘要必须满足以下格式:"本次更改涉及到XXX等文件,这些文件分别属于社区中的XXX模块。涉及到XXX的修改,可能会对XXX造成影响。总的来说,这次更改主要是XXX。" + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + + + """), + ("human", """ +各个文件的改动摘要: +{file_changes} + +总文件数: {total_files} + """) + ]) + +class SingleFileAnalysisChain: + """单文件分析任务链""" + + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter): + self.llm = llm + self.token_counter = token_counter + + # 创建输出解析器 + self.output_parser = JsonOutputParser(pydantic_object=SingleFileSummary) + + # 根据后端类型选择不同的链构建方式 + if BACKEND_TYPE == "ollama": + self.prompt = PromptTemplates.get_single_file_prompt() + self.chain = self.prompt | self.llm.with_structured_output(SingleFileSummary) + else: + # 为硅基流动平台添加输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "change_type": "改动类型(必须是以下之一:仅涉及标点符号的修改、涉及到中英文文本内容的修改、涉及到代码内容的修改、涉及到其他内容的修改)", + "potential_impact": "改动对其他文件潜在的影响", + "summary": "改动的详细摘要" +}} +""" + # 创建新的prompt模板 + system_template = """ +你是一个专业的Git维护专家,擅长总结社区文档的改动,请分析以下git diff中单个文件的改动,并生成结构化的摘要。 + +请仔细分析这个文件的改动,并按照以下要求生成摘要: + +**务必注意:当你对单个文件的所有变更内容从头到尾进行过完整的分析之后,再生成你最终的结论!不要仅根据其中几行的增删改就给出你的结论!** + +1. 改动类型判断(必须选择以下四种之一,请严格按照示例进行判断): + + - "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - "仅涉及标点符号的修改":仅修改了标点符号的增减、删除、变动,几乎不影响理解 + - "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + - "涉及到中英文文本内容的修改":修改了文档内容、命令或代码注释、字符串等文本,需要对内容进行翻译或调整以使得所有语种的人都可以理解 + +**其中,你需要重点对后三种类型的修改进行区分。越靠后,修改类型判定的优先级越高。** +如果修改的内容仅仅为新增了二进制文件、新增了依赖库等其他内容,绝大部分情况都可以归类为"涉及到其他内容的修改"。 +如果修改的内容不涉及中文或英文字符且不涉及代码改动,绝大部分情况都可以归类为"仅涉及标点符号的修改",但一旦存在除了标点符号或文档格式以外的改动,则优先归为其他类别。 +如果修改的内容涉及代码逻辑、函数定义、配置结构、脚本实现等可能产生现实影响的变更,或者对环境部署命令行、内容配置进行了更改或调整,但不需要对内容进行翻译或调整以使得所有语种的人都可以理解,则归类为"涉及到代码内容的修改"。 +如果修改的内容涉及中文或英文字符,且需要对内容进行翻译或调整以使得所有语种的人都可以理解,可以归类为"涉及到中英文文本内容的修改"。 +一个区分"涉及到代码内容的修改"和"涉及到中英文文本内容的修改"的标准是:如果当前的改动属于某一语言,如果使用者不理解该语言,则必须要对改动进行翻译才能理解,则归类为"涉及到中英文文本内容的修改",否则归类为"涉及到代码内容的修改"。 + +下面我将提供几个判断示例供你参考: + +示例1 - 仅涉及标点符号的修改: +```diff +- 这是一个测试文档,用于演示功能。 ++ 这是一个测试文档,用于演示功能! +``` +分析:只变更了逗号为中文逗号,句号为感叹号,属于"仅涉及标点符号的修改" +或者文件中: +```diff +- 这个文档的功能有进一步补充的空间。 ++ 这个文档的功能有进一步补充的空间! +``` +分析:只涉及中文句号和感叹号的增删改,不涉及中文字符和英文字符的改动,且不涉及代码改动,属于"仅涉及标点符号的修改" + +示例2 - 涉及到代码内容的修改: +```diff +- function getUserInfo() ++ function getUserProfile() +``` +或者在文档的代码块中: +```diff +- ```python +- def hello(): +- print("hello") +- ``` ++ ```python ++ def greeting(): ++ print("hello world") ++ ``` +``` +或者在文档的命令行代码块中 +```diff +- pwd +- cat /etc/profile ++ sudo apt update ++ whoami ++ echo "hello" +``` +分析:修改了函数名、逻辑或文档文本中的代码块等,但是不涉及需要翻译的内容,属于"涉及到代码内容的修改" + +示例3 - 涉及到中英文文本内容的修改: +```diff +- // 这是一个注释说明 ++ // 这是一个更详细的注释说明 +``` +或者JSON中: +```diff +- "description": "用户管理模块" ++ "description": "用户账户管理模块" +``` +分析:修改了注释或文档文本内容,影响用户的阅读理解,需要对内容进行翻译或调整以使得所有语种的人都可以理解,属于"涉及到中英文文本内容的修改" + +示例4 - 涉及到其他内容的修改: +```diff ++ Binary file image.png added +``` +或者: +```diff ++ "dependencies": ++ "new-package": "^1.0.0" ++ +``` +分析:新增了二进制文件或依赖包等,属于"涉及到其他内容的修改" + +2. 潜在影响分析: + - 分析这个文件的改动可能对其他文件或整体系统造成的影响 + - 考虑依赖关系、接口变化、数据流等 + - 如果是配置文件的修改,考虑对系统配置的影响 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 详细摘要: + - 提炼出摘要改动文件所属的板块,并解释板块作用 + - 结合文件名和改动细节,用详细的语言描述具体的改动内容,要求准确全面,且改动内容要做到具体 + - 突出重要的改动点和影响范围,包括修改内容主要针对的对象、文档的分类等 + - 结合文件名、改动类型、潜在影响分析,对摘要做进一步补充 + +4. 
输出格式: + - 请用中文生成摘要 + - 要求改动类型、潜在影响、改动内容总结都包含在摘要中,不能存在空字段 + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + +{format_instructions} +""" + human_template = """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def analyze(self, diff_file_info: DiffFileInfo) -> Optional[SingleFileSummary]: + """分析单个文件的改动""" + max_retry = MODEL_MAX_RETRY_OLLAMA if BACKEND_TYPE == "ollama" else MODEL_MAX_RETRY + for attempt in range(1, max_retry + 1): + # 如果不是第一次尝试,等待一段时间再重试,避免连续失败 + if attempt > 1: + delay = min(attempt * 2, 10) # 递增延迟,最多10秒 + logger.info(f"第{attempt}次尝试分析文件 {diff_file_info.file_path},等待{delay}秒...") + time.sleep(delay) + + try: + # 构造prompt字符串 + prompt_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content + } + try: + messages = self.prompt.format_messages(**prompt_args) + if messages and len(messages) > 0: + message = messages[0] + if hasattr(message, 'content') and message.content: + prompt_str = str(message.content) + if prompt_str: + self.token_counter.count_prompt(prompt_str) + except Exception as e: + logger.warning(f"格式化prompt时发生错误: {e}") + + # 直接调用,简化超时控制 + invoke_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content, + "lines_added": diff_file_info.lines_added, + "lines_deleted": diff_file_info.lines_deleted + } + if BACKEND_TYPE != "ollama": + invoke_args["response_format"] = {"type": "json_object"} + + result = self.chain.invoke(invoke_args) + # 验证结果有效性 + if isinstance(result, (dict, SingleFileSummary)): + if isinstance(result, dict): + result = SingleFileSummary(**result) + + # 检查结果完整性 + if result and hasattr(result, 'summary') and result.summary and result.change_type: + # 统计completion token + try: + completion_str = str(result.summary) + if completion_str: + self.token_counter.count_completion(completion_str) + except Exception as e: + logger.warning(f"计算completion tokens时发生错误: {e}") + + # 设置准确值 + result.file_path = diff_file_info.file_path + result.lines_added = diff_file_info.lines_added + result.lines_deleted = diff_file_info.lines_deleted + return result + + # 结果无效,记录并重试 + logger.warning(f"分析文件 {diff_file_info.file_path} 返回无效结果,第{attempt}次尝试") + if attempt < max_retry: + continue + except Exception as e: + err_str = str(e) + # 检查是否为HTTP错误(如404、5xx),常见关键字有status code、HTTP、response等 + is_http_error = False + for code in ["404", "500", "502", "503", "504"]: + if code in err_str: + is_http_error = True + break + if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and any(code in err_str for code in ["404", "500", "502", "503", "504"]): + is_http_error = True + if is_http_error: + logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e},第{attempt}次尝试,10秒后重试...") + if attempt < max_retry: + time.sleep(10) + continue + else: + logger.error(f"分析文件 {diff_file_info.file_path} 时发生错误: {e},第{attempt}次尝试") + # 其它异常直接进入下一次重试 + if attempt < max_retry: + logger.info(f"第{attempt}次尝试失败,准备重试...") + logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry}次均未获得结构化输出,放弃。") + return None + +class TotalSummaryChain: + """总摘要生成任务链""" + + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter): + self.llm = llm + self.token_counter = token_counter + + # 创建输出解析器 + self.output_parser = 
JsonOutputParser(pydantic_object=TotalSummary) + + # 根据后端类型选择不同的链构建方式 + if BACKEND_TYPE == "ollama": + self.prompt = PromptTemplates.get_total_summary_prompt() + self.chain = self.prompt | self.llm.with_structured_output(TotalSummary) + else: + # 为硅基流动平台添加输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "overall_potential_impact": "整体改动对其他文件潜在的影响", + "overall_summary": "整体改动的详细摘要" +}} +""" + # 创建新的prompt模板 + system_template = """ +你是一个专业的Git维护专家,擅长总结社区文档的改动,请基于以下各个文件的改动摘要,生成整个git diff的总摘要。 + +请分析所有文件的改动,并生成一个总摘要,要求: + +1. 整体改动类型统计: + - 统计所有文件涉及到的改动类型,取并集 + - 四种改动类型说明: + * "仅涉及标点符号的修改":只修改了标点符号的增减、删除、变动 + * "涉及到中英文文本内容的修改":修改了文档内容、注释等文本,但未涉及代码逻辑 + * "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + * "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - 将所有出现的改动类型都列出,不做优先级选择 + +统计示例: + +示例1 - 单一类型: +文件A:仅涉及标点符号的修改 +文件B:仅涉及标点符号的修改 +→ 整体改动类型:["仅涉及标点符号的修改"] + +示例2 - 多种类型: +文件A:仅涉及标点符号的修改 +文件B:涉及到中英文文本内容的修改 +→ 整体改动类型:["仅涉及标点符号的修改", "涉及到中英文文本内容的修改"] + +示例3 - 复杂混合: +文件A:涉及到中英文文本内容的修改 +文件B:涉及到代码内容的修改 +文件C:涉及到其他内容的修改 +→ 整体改动类型:["涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] + +2. 整体潜在影响分析: + - 逐个总结所有文件的改动内容,并进行详细的列举,尽量涵盖所有修改内容 + - 综合分析所有文件改动对系统的整体影响 + - 考虑文件间的依赖关系和系统架构影响 + - 评估改动的风险等级和影响范围 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 整体摘要详细列举: + - 提炼出所有摘要改动文件所属的板块,并解释板块作用 + - 用详细的语言分条概括每个摘要文件的核心内容,需要具体到文件,这一部分要占到最大的篇幅,不要遗漏任何摘要文件的内容 + - 突出重要的改动点,包括修改内容主要针对的对象、文档的分类等 + - 注意:整体摘要需要总结所有文件的内容;整体摘要需要尽可能详细 + +4. 输出格式: + - 请用中文生成摘要,整体摘要内容字段务必全面详细 + - 要求整体潜在影响、整体摘要都包含在摘要中,不能存在空字段 + - 整体摘要必须满足以下格式:"本次更改涉及到XXX等文件,这些文件分别属于社区中的XXX模块。涉及到XXX的修改,可能会对XXX造成影响。总的来说,这次更改主要是XXX。" + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + +{format_instructions} +""" + human_template = """ +各个文件的改动摘要: +{file_changes} + +总文件数: {total_files} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def generate(self, file_summaries: List[SingleFileSummary]) -> Optional[TotalSummary]: + """生成总摘要""" + try: + total_files = len(file_summaries) + total_lines = sum(s.lines_added + s.lines_deleted for s in file_summaries) + file_changes_info = [] + # 收集所有改动类型 + all_change_types = list(set(s.change_type for s in file_summaries)) + + for summary in file_summaries: + file_changes_info.append({ + 'file_path': summary.file_path, + 'change_type': summary.change_type, + 'potential_impact': summary.potential_impact, + 'summary': summary.summary + }) + + # 构造prompt字符串 + prompt_args = { + "file_changes": json.dumps(file_changes_info, ensure_ascii=False, indent=2), + "total_files": total_files + } + try: + messages = self.prompt.format_messages(**prompt_args) + if messages and len(messages) > 0: + message = messages[0] + if hasattr(message, 'content') and message.content: + prompt_str = str(message.content) + if prompt_str: + self.token_counter.count_prompt(prompt_str) + except Exception as e: + logger.warning(f"格式化prompt时发生错误: {e}") + + # 使用线程池执行器为总摘要生成添加超时控制 + timeout_executor = None + try: + timeout_executor = ThreadPoolExecutor(max_workers=1) + invoke_args = { + "file_changes": json.dumps(file_changes_info, ensure_ascii=False, indent=2), + "total_files": total_files, + "total_lines": total_lines + } + if BACKEND_TYPE != "ollama": + # 为 SiliconFlow 添加 response_format 参数 + invoke_args["response_format"] = {"type": "json_object"} + + # 提交任务并设置超时 + future = timeout_executor.submit(self.chain.invoke, invoke_args) + try: + result = 
future.result(timeout=TOTAL_SUMMARY_TIMEOUT) + except (FutureTimeoutError, TimeoutError) as e: + logger.error(f"生成总摘要超时({TOTAL_SUMMARY_TIMEOUT}秒),放弃生成总摘要: {type(e).__name__}") + try: + future.cancel() # 尝试取消超时的任务 + except Exception as cancel_e: + logger.warning(f"取消任务时发生错误: {cancel_e}") + return None + finally: + # 确保线程池被正确关闭 + if timeout_executor: + try: + timeout_executor.shutdown(wait=False) + except Exception as shutdown_e: + logger.warning(f"关闭总摘要线程池时发生错误: {shutdown_e}") + + # 处理结果 + if isinstance(result, (dict, TotalSummary)): + # 如果是dict(来自JsonOutputParser),转换为TotalSummary + if isinstance(result, dict): + result = TotalSummary(**result) + try: + if result and hasattr(result, 'overall_summary'): + summary = result.overall_summary + if summary: + completion_str = str(summary) + if completion_str: + self.token_counter.count_completion(completion_str) + except Exception as e: + logger.warning(f"计算completion tokens时发生错误: {e}") + return TotalSummary( + total_files_changed=total_files, + total_lines_changed=total_lines, + overall_potential_impact=result.overall_potential_impact, + overall_summary=result.overall_summary, + change_type_list=all_change_types, + file_changes=[ + FileChangeInfo( + file_path=summary.file_path, + change_type=summary.change_type, + lines_changed=summary.lines_added + summary.lines_deleted + ) + for summary in file_summaries + ] + ) + else: + logger.error(f"生成总摘要时返回类型错误: {type(result)}") + return None + except Exception as e: + logger.error(f"生成总摘要时发生错误: {e}") + return None + +# ==================== 主处理类 ==================== + +class GitDiffSummarizer: + """Git Diff 摘要生成器""" + + def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", model_name: str = None, base_url: str = None): + if model_name is None: + model_name = MODEL_NAME + if base_url is None: + base_url = OLLAMA_BASE_URL + + # 设置siliconflow API配置 + global SILICONFLOW_API_KEY, SILICONFLOW_API_BASE + if siliconflow_api_key: + SILICONFLOW_API_KEY = siliconflow_api_key + if siliconflow_api_base: + SILICONFLOW_API_BASE = siliconflow_api_base + + self.token_counter = TokenCounter(model_name) + self.llm = LLMFactory.create_chat_llm(model_name, base_url) + self.single_file_chain = SingleFileAnalysisChain(self.llm, self.token_counter) + self.total_summary_chain = TotalSummaryChain(self.llm, self.token_counter) + + def cleanup(self): + """清理资源,确保程序能正确退出""" + try: + # 清理 LLM 连接 + if hasattr(self.llm, 'client') and hasattr(self.llm.client, 'close'): + self.llm.client.close() + elif hasattr(self.llm, '_client') and hasattr(self.llm._client, 'close'): + self.llm._client.close() + + # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端 + if BACKEND_TYPE == "siliconflow" and hasattr(self.llm, 'client'): + try: + # 强制关闭 httpx 客户端 + if hasattr(self.llm.client, '_client'): + self.llm.client._client.close() + except Exception as e: + logger.debug(f"关闭 HTTP 客户端时发生错误: {e}") + + logger.info("资源清理完成") + except Exception as e: + logger.warning(f"清理资源时发生错误: {e}") + + def process_git_diff(self, diff_content: str, max_workers: int = None) -> ProcessingResult: + if max_workers is None: + max_workers = PROCESSING_MAX_WORKERS + + logger.info("开始解析git diff...") + files = DiffParser.parse_git_diff(diff_content) + logger.info(f"解析到 {len(files)} 个文件的改动") + if not files: + logger.warning("未找到任何文件改动") + return ProcessingResult( + file_summaries=[], + total_summary=None, + processed_files=0, + total_files=0, + error='未找到任何文件改动' + ) + logger.info("开始并行处理各个文件的改动...") + file_summaries = [] + # 使用更健壮的并发处理机制 + 
executor = None + try: + executor = ThreadPoolExecutor(max_workers=max_workers) + future_to_file = { + executor.submit(self.single_file_chain.analyze, file_info): file_info.file_path + for file_info in files + } + + # 设置更长的整体超时时间,避免与单个文件超时冲突 + overall_timeout = SINGLE_FILE_TIMEOUT * len(files) + 600 # 给每个文件的时间 + 额外缓冲 + + completed_count = 0 + total_count = len(future_to_file) + + try: + for future in as_completed(future_to_file, timeout=overall_timeout): + file_path = future_to_file[future] + completed_count += 1 + try: + summary = future.result(timeout=5) # 短暂缓冲时间,因为任务已经完成 + if summary: + file_summaries.append(summary) + logger.info(f"完成文件 {file_path} 的摘要生成 ({completed_count}/{total_count})") + else: + logger.warning(f"文件 {file_path} 的摘要生成失败 ({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError) as e: + logger.error(f"文件 {file_path} 的摘要获取超时,跳过该文件: {type(e).__name__} ({completed_count}/{total_count})") + try: + future.cancel() + except Exception as cancel_e: + logger.warning(f"取消任务时发生错误: {cancel_e}") + except Exception as e: + logger.error(f"处理文件 {file_path} 时发生异常: {e} ({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError) as overall_e: + logger.error(f"整体处理超时({overall_timeout}秒),已完成{completed_count}/{total_count}个文件") + # 取消所有未完成的任务 + for future in future_to_file: + if not future.done(): + try: + future.cancel() + except Exception as cancel_e: + logger.warning(f"取消未完成任务时发生错误: {cancel_e}") + finally: + # 确保线程池被正确关闭 + if executor: + try: + executor.shutdown(wait=True) + except Exception as shutdown_e: + logger.warning(f"关闭主线程池时发生错误: {shutdown_e}") + logger.info(f"成功生成 {len(file_summaries)} 个文件的摘要") + logger.info("开始生成总摘要...") + total_summary = None + if file_summaries: + logger.info(f"基于 {len(file_summaries)} 个成功处理的文件生成总摘要...") + try: + total_summary = self.total_summary_chain.generate(file_summaries) + if total_summary: + logger.info("总摘要生成成功") + else: + logger.warning("总摘要生成失败") + except Exception as e: + logger.error(f"生成总摘要时发生未预期的错误: {e}") + else: + logger.warning("没有成功处理的文件,跳过总摘要生成") + return ProcessingResult( + file_summaries=file_summaries, + total_summary=total_summary, + processed_files=len(file_summaries), + total_files=len(files) + ) + +# ==================== 主函数 ==================== + +def get_agent_summary(sample_diff, siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1"): + + summarizer = GitDiffSummarizer(siliconflow_api_key, siliconflow_api_base) + result = None + try: + result = summarizer.process_git_diff(sample_diff) + finally: + # 确保在函数退出前清理资源 + summarizer.cleanup() + + if not result: + print("处理失败,无法获取结果") + return None + + if result.error: + print(f"错误: {result.error}") + print("\n=== 单文件摘要 ===") + for summary in result.file_summaries: + print(f"文件: {summary.file_path}") + print(f"改动类型: {summary.change_type}") + print(f"新增行数: {summary.lines_added}") + print(f"删除行数: {summary.lines_deleted}") + print(f"潜在影响: {summary.potential_impact}") + print(f"摘要: {summary.summary}") + print("-" * 50) + print("=== 处理结果 ===") + print(f"总文件数: {result.total_files}") + print(f"成功处理文件数: {result.processed_files}") + if result.total_summary: + print("\n=== 总摘要 ===") + total = result.total_summary + print(f"总文件数: {total.total_files_changed}") + print(f"总改动行数: {total.total_lines_changed}") + print(f"改动类型列表: {total.change_type_list}") + print(f"整体潜在影响: {total.overall_potential_impact}") + print(f"整体摘要: {total.overall_summary}") + print("\n=== 文件改动列表 ===") + for file_change in total.file_changes: + print(f"- 
{file_change.file_path}: {file_change.change_type} ({file_change.lines_changed} 行)") + + # 输出token统计 + stats = summarizer.token_counter.get_stats() + print("\n=== Token消耗统计 ===") + print(f"Prompt tokens: {stats['prompt_tokens']}") + print(f"Completion tokens: {stats['completion_tokens']}") + print(f"Total tokens: {stats['total_tokens']}") + # exit() + return result + +if __name__ == "__main__": + # 微服务接口逻辑模拟: 传递进来的就是 sample_diff 的内容 + sample_diff = sys.argv[1] + result = get_agent_summary(sample_diff) + print(result) \ No newline at end of file -- Gitee From 7cdf3c857e07c1b9e807710e45a5974afeab8f83 Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Sat, 20 Sep 2025 20:32:48 +0800 Subject: [PATCH 2/8] update ci-bot for auto generating translation comment --- ci/tools/comment/comment_agent.py | 958 +++++++++++++++++++++++++++ ci/tools/comment/create_comment.py | 372 +++++++++++ ci/tools/comment/create_comment.yaml | 38 ++ 3 files changed, 1368 insertions(+) create mode 100644 ci/tools/comment/comment_agent.py create mode 100644 ci/tools/comment/create_comment.py create mode 100644 ci/tools/comment/create_comment.yaml diff --git a/ci/tools/comment/comment_agent.py b/ci/tools/comment/comment_agent.py new file mode 100644 index 000000000..25dbe385c --- /dev/null +++ b/ci/tools/comment/comment_agent.py @@ -0,0 +1,958 @@ +import json +import re +import logging +import urllib.parse +from typing import List, Dict, Any, Optional, Tuple, Literal +from dataclasses import dataclass +from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError +from pathlib import Path +import sys +import time +# LangChain imports +from langchain_core.prompts import ChatPromptTemplate, PromptTemplate +from langchain_core.runnables import RunnableLambda, RunnablePassthrough +from pydantic import BaseModel, Field, SecretStr +from langchain_community.llms import Ollama +from langchain_ollama import ChatOllama +from langchain.chains import TransformChain, SequentialChain +from langchain_core.output_parsers import JsonOutputParser +from langchain_openai import ChatOpenAI +import yaml + +# ==================== 配置加载 ==================== + +def load_config(config_file="create_comment.yaml"): + """从YAML文件加载配置""" + try: + with open(config_file, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + return config.get('comment_agent', {}) + except FileNotFoundError: + print(f"配置文件 {config_file} 不存在") + raise + except yaml.YAMLError as e: + print(f"解析配置文件时发生错误: {e}") + raise + +# 加载配置 +_config = load_config() + +# ==================== 配置常量 ==================== + +BACKEND_TYPE = _config.get('backend', {}).get('type', 'siliconflow') +MODEL_NAME = _config.get('model', {}).get('name', 'Qwen/Qwen3-8B') +MODEL_TEMPERATURE = _config.get('model', {}).get('temperature', 0.1) +MODEL_MAX_RETRY = _config.get('model', {}).get('max_retry', 5) +PROCESSING_MAX_WORKERS = _config.get('processing', {}).get('max_workers', 8) +SINGLE_FILE_TIMEOUT = _config.get('processing', {}).get('single_file_timeout', 180) +TOTAL_COMMENT_TIMEOUT = _config.get('processing', {}).get('total_comment_timeout', 300) +LOGGING_LEVEL = _config.get('logging', {}).get('level', 'INFO') +SILICONFLOW_API_KEY = '' +SILICONFLOW_API_BASE = '' + +# 配置日志 +logging.basicConfig(level=getattr(logging, LOGGING_LEVEL.upper())) +logger = logging.getLogger(__name__) + +# ==================== 数据模型定义 ==================== + +class PRAnalysisResult(BaseModel): + """PR分析结果的结构化输出""" + has_text_changes: bool = 
Field(description="是否涉及英文文本改动", default=False) + text_change_type: Literal["无文本改动", "仅标点符号改动", "英文内容改动", "代码注释改动", "混合改动"] = Field(description="文本改动类型") + has_grammar_errors: bool = Field(description="是否存在语法语病错误", default=False) + grammar_errors: List[str] = Field(description="具体的语法语病错误列表", default=[]) + detailed_analysis: str = Field(description="详细分析说明") + suggestions: List[str] = Field(description="改进建议列表", default=[]) + +class FileTextAnalysis(BaseModel): + """单个文件的文本分析""" + file_path: str = Field(description="文件路径", default="") + has_text_changes: bool = Field(description="是否涉及英文文本改动", default=False) + text_lines: List[str] = Field(description="涉及文本改动的行", default=[]) + grammar_issues: List[str] = Field(description="语法问题列表", default=[]) + analysis_details: str = Field(description="分析详情") + +@dataclass +class DiffFileInfo: + """单个文件的diff信息""" + file_path: str + diff_content: str + lines_added: int + lines_deleted: int + +@dataclass +class CommentResult: + """评论生成结果""" + pr_analysis: Optional[PRAnalysisResult] + file_analyses: List[FileTextAnalysis] + processed_files: int + total_files: int + error: Optional[str] = None + +# ==================== Token 统计工具 ==================== + + +# ==================== 工具函数 ==================== + +class DiffParser: + """Git Diff 解析器""" + + @staticmethod + def parse_git_diff(diff_content: str) -> List[DiffFileInfo]: + """ + 解析git diff内容,提取每个文件的改动信息 + + Args: + diff_content: git diff的原始内容 + + Returns: + 包含文件路径和对应diff内容的列表 + """ + + files = [] + current_file = None + current_diff = [] + + lines = diff_content.strip().split('\n') + + for line in lines: + # 匹配文件路径行 + if line.startswith('diff --git'): + # 保存前一个文件的信息 + if current_file and current_diff: + diff_info = DiffParser._create_diff_file_info(current_file, current_diff) + if diff_info: + files.append(diff_info) + + # 提取文件路径 - 改进的解析逻辑 + current_file = DiffParser._extract_file_path(line) + if current_file: + current_diff = [line] + else: + current_diff = [] + elif current_file: + current_diff.append(line) + + # 添加最后一个文件 + if current_file and current_diff: + diff_info = DiffParser._create_diff_file_info(current_file, current_diff) + if diff_info: + files.append(diff_info) + + return files + + @staticmethod + def _extract_file_path(diff_line: str) -> Optional[str]: + """ + 从git diff行中提取文件路径,支持包含汉字的文件名 + + Args: + diff_line: git diff的文件头行,格式如 "diff --git a/path/to/file b/path/to/file" + + Returns: + 提取出的文件路径,如果解析失败则返回None + """ + try: + # 方法1: 处理引号包围的路径(Git对特殊字符的处理) + # 格式: diff --git "a/path/to/file" "b/path/to/file" + quoted_pattern = r'diff --git "a/(.+?)" "b/(.+?)"' + quoted_match = re.match(quoted_pattern, diff_line) + + if quoted_match: + file_path_a = quoted_match.group(1) + file_path_b = quoted_match.group(2) + # 通常a和b路径相同,使用a路径(旧文件路径) + file_path = file_path_a + else: + # 方法2: 使用正则表达式匹配标准的git diff格式 + # 格式: diff --git a/path/to/file b/path/to/file + pattern = r'diff --git a/(.+?) 
b/(.+?)(?:\s|$)'
+                match = re.match(pattern, diff_line)
+
+                if match:
+                    file_path_a = match.group(1)
+                    file_path_b = match.group(2)
+                    # the a and b paths are normally identical; use the a (old) path
+                    file_path = file_path_a
+                else:
+                    # Method 3: if the regex fails, fall back to simpler parsing
+                    # for file names that may contain spaces and special characters
+                    if ' a/' in diff_line and ' b/' in diff_line:
+                        # locate the ' a/' and ' b/' markers
+                        a_pos = diff_line.find(' a/')
+                        b_pos = diff_line.find(' b/')
+
+                        if a_pos != -1 and b_pos != -1 and a_pos < b_pos:
+                            # take the path between a/ and b/
+                            a_start = a_pos + 3  # skip ' a/'
+                            file_path = diff_line[a_start:b_pos]
+                        else:
+                            return None
+                    else:
+                        # Method 4: last resort, plain whitespace splitting
+                        parts = diff_line.split()
+                        if len(parts) >= 3:
+                            a_path = parts[2]
+                            if a_path.startswith('a/'):
+                                file_path = a_path[2:]  # strip the 'a/' prefix
+                            else:
+                                return None
+                        else:
+                            return None
+
+            # decode any encoded characters in the file name
+            return DiffParser._decode_file_path(file_path)
+
+        except Exception as e:
+            logger.warning(f"解析文件路径时发生错误: {e}, diff行: {diff_line}")
+            return None
+
+    @staticmethod
+    def _decode_file_path(file_path: str) -> str:
+        """
+        Decode a file path, handling the encodings Git may apply.
+
+        Args:
+            file_path: the raw file path
+
+        Returns:
+            the decoded file path
+        """
+        try:
+            # URL-decode first to handle Git's percent-encoded file names
+            decoded_path = urllib.parse.unquote(file_path, encoding='utf-8')
+
+            # unwrap the quotes Git puts around paths with special characters
+            if decoded_path.startswith('"') and decoded_path.endswith('"'):
+                decoded_path = decoded_path[1:-1]
+                # Git escapes with backslashes; undo the escape sequences
+                decoded_path = decoded_path.replace('\\"', '"')
+                decoded_path = decoded_path.replace('\\\\', '\\')
+
+            # quoted or not, try to decode octal escape sequences
+            # whenever the path contains a \ooo sequence
+            if '\\' in decoded_path and re.search(r'\\[0-7]{3}', decoded_path):
+                decoded_path = DiffParser._decode_octal_sequences(decoded_path)
+
+            return decoded_path
+
+        except Exception as e:
+            logger.warning(f"解码文件路径时发生错误: {e}, 原始路径: {file_path}")
+            return file_path
+
+    @staticmethod
+    def _decode_octal_sequences(text: str) -> str:
+        """
+        Decode octal escape sequences in the text.
+
+        Args:
+            text: text that may contain octal escape sequences
+
+        Returns:
+            the decoded text
+        """
+        try:
+            # octal escape sequence pattern: \xxx
+            pattern = r'\\([0-7]{3})'
+
+            # find all octal sequences
+            matches = list(re.finditer(pattern, text))
+            if not matches:
+                return text
+
+            # accumulate byte values so multi-byte UTF-8 runs decode together
+            result = ""
+            last_end = 0
+            bytes_buffer = []
+
+            for i, match in enumerate(matches):
+                # append the text before this match
+                if match.start() > last_end:
+                    # flush any buffered bytes first
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                            result += decoded_bytes
+                            bytes_buffer = []
+                        except UnicodeDecodeError:
+                            # decoding failed; keep the original escaped form
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                            bytes_buffer = []
+
+                    result += text[last_end:match.start()]
+
+                # handle the current octal sequence
+                octal_str = match.group(1)
+                try:
+                    byte_value = int(octal_str, 8)
+                    bytes_buffer.append(byte_value)
+                except ValueError:
+                    # conversion failed; flush the buffer and keep the raw match
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                            result += decoded_bytes
+                            bytes_buffer = []
+                        except UnicodeDecodeError:
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                            bytes_buffer = []
+                    result += match.group(0)
+
+                last_end = match.end()
+
+                # check whether this is the last match or the next one is not adjacent
+                is_last = (i == len(matches) - 1)
+                is_next_non_consecutive = (not is_last and
+                                           matches[i + 1].start() != match.end())
+
+                if is_last or is_next_non_consecutive:
+                    # flush the buffered bytes; the decoded run must be appended
+                    # to result here, otherwise those characters are silently lost
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                            result += decoded_bytes
+                        except UnicodeDecodeError:
+                            # decoding failed; keep the original escaped form
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                        bytes_buffer = []
+
+            # append the remaining text
+            if last_end < len(text):
+                result += text[last_end:]
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"解码八进制序列时发生错误: {e}, 原始文本: 
{text}") + return text + + @staticmethod + def _create_diff_file_info(file_path: str, diff_lines: List[str]) -> Optional[DiffFileInfo]: + """创建DiffFileInfo对象""" + diff_content = '\n'.join(diff_lines) + lines_added, lines_deleted = DiffParser._count_lines_changed(diff_content) + + return DiffFileInfo( + file_path=file_path, + diff_content=diff_content, + lines_added=lines_added, + lines_deleted=lines_deleted + ) + + @staticmethod + def _count_lines_changed(diff_content: str) -> Tuple[int, int]: + """统计git diff中改动的行数""" + lines_added, lines_deleted = 0, 0 + lines = diff_content.strip().split('\n') + + for line in lines: + # 统计新增行(以+开头,但不是+++) + if line.startswith('+') and not line.startswith('+++'): + lines_added += 1 + # 统计删除行(以-开头,但不是---) + elif line.startswith('-') and not line.startswith('---'): + lines_deleted += 1 + + return lines_added, lines_deleted + +# ==================== LangChain 组件 ==================== + +class LLMFactory: + """LLM工厂类""" + + @staticmethod + def create_chat_llm(model_name: str = None, base_url: str = None): + """创建LLM实例""" + if model_name is None: + model_name = MODEL_NAME + + if BACKEND_TYPE == "siliconflow": + return ChatOpenAI( + model=model_name, + api_key=SecretStr(SILICONFLOW_API_KEY), + base_url=SILICONFLOW_API_BASE, + temperature=MODEL_TEMPERATURE + ) + else: + raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}") + +class PromptTemplates: + """提示模板集合""" + + @staticmethod + def get_file_text_analysis_prompt() -> ChatPromptTemplate: + """获取单文件文本分析提示模板""" + return ChatPromptTemplate.from_messages([ + ("system", f""" +你是一个专业的代码审查和语言专家,专注于分析Gitee文档仓库的翻译PR中的英文文本内容。每条PR都是人工生成的文档改动。请忽略中文、格式和代码的审计,专注于识别英文文本变更。 + +注意:请忽略中文、格式和代码的审计,专注于识别英文文本变更。如果文档的变更不涉及英文文本,你只需要输出“不涉及英文改动”即可,不需要额外输出任何分析结果。 +同时:对于专有名词,例如openEuler、GitHub等,你不能将其纳入英文文本变更的纠错范围内,而是应该自动识别专有名词。对于代码的相关变更,也不应该纳入分析内容范围。 + +你需要遵循**能不提修改意见就不提修改意见**的原则进行审查!!! + +请仔细分析这个文件的改动,并按照以下要求进行分析: + +**分析重点:** + +1. 英文文本变更识别: + - 检查是否涉及英文文本内容的改动 + - 区分代码逻辑变更和英文文本内容变更 + - 识别注释、文档字符串、用户显示文本等英文文本内容 + - 标识出具体的英文文本变更行 + +2. 语法错误检测: + - 检查英文文本的语法、拼写错误 + +**分析类型判断:** +- 如果改动不涉及任何英文文本内容,标记为"无英文文本改动" +- 如果涉及代码注释的英文文本变更,标记为"代码注释改动" +- 如果涉及文档、界面文本等英文内容变更,标记为"英文内容改动" + +**语法检查重点:** +- 英文:主谓一致、时态、拼写、标点、语序 + +**输出要求:** +- 如果存在英文文本变更但变更不存在语法问题,则直接输出“不存在语法问题”,不需要任何额外输出 +- 详细列出发现的语法错误(如果有) +- 不能超过100个汉字字符 + + """), + ("human", """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} + + """) + ]) + + @staticmethod + def get_pr_analysis_prompt() -> ChatPromptTemplate: + """获取整体PR分析提示模板""" + return ChatPromptTemplate.from_messages([ + ("system", """ +你是一个专业的PR审查专家,专门分析Gitee文档仓库的翻译PR中的英文文本变更和语法问题。每条PR都是人工生成的文档改动。 + +请分析所有文件的改动,并生成一个综合评估,要求: + +1. 整体文本变更评估: + - 统计涉及文本变更的文件数量 + - 分析文本变更的类型分布 + - 评估变更的重要性和影响范围 + - 如果文本变更不涉及英文,或涉及英文但使用正确不需要改动,则**直接忽略**,无需对其进行总结 + +2. 语法错误汇总: + - **仅汇总改动中的硬伤,如单词拼写错误、英语语法(时态语态)错误等** + - **对于一些可以优化但称不上错误的点,以最小化改动为原则,选择忽略** + - 提高报错阈值,忽略可优化翻译的点 + - 提供优先修复建议 + +3. 质量评估: + - 对整个PR的文本质量给出评分 + - 分析文本变更的一致性 + - 评估对用户体验的影响 + +4. 
改进建议: + - 提供具体的修改建议 + - 推荐最佳实践 + - 建议后续的质量控制措施 + +**输出格式要求:** +- 提供清晰的分析结论 +- 按优先级排列发现的问题 +- 给出可操作的改进建议 + + """), + ("human", """ +各个文件的分析结果: +{file_analyses} + +总文件数: {total_files} +涉及文本变更的文件数: {text_changed_files} + """) + ]) + +class FileTextAnalysisChain: + """单文件文本分析任务链""" + + def __init__(self, llm: ChatOpenAI): + self.llm = llm + + # 创建输出解析器 + self.output_parser = JsonOutputParser(pydantic_object=FileTextAnalysis) + + # 为硅基流动平台添加输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "has_text_changes": "是否涉及英文文本改动(布尔值)", + "text_lines": "涉及文本改动的行(字符串列表)", + "grammar_issues": "语法问题列表(字符串列表)", + "analysis_details": "分析详情(字符串)" +}} +""" + # 创建新的prompt模板 + system_template = """ +你是一个专业的代码审查和语言专家,专注于分析Gitee文档仓库的翻译PR中的英文文本内容。每条PR都是人工生成的文档改动。 + +**核心原则:只关注必然存在明显错误的地方,其他文件都不需要关注!** + +**严格过滤条件:** +1. 如果文档的变更不涉及英文文本,直接标记为"无英文文本改动",无需任何分析 +2. 如果涉及英文文本但语法完全正确,直接标记为"语法正确,无需关注" +3. 如果仅涉及标点符号的微小调整,直接标记为"仅标点符号改动,无需关注" +4. 对于专有名词(如openEuler、GitHub等),自动识别并忽略,不纳入纠错范围 +5. 对于代码相关变更,不纳入分析内容范围 + +**只关注以下明显错误:** +- 明显的单词拼写错误(如:recieve -> receive) +- 严重的语法错误(如:主谓不一致、时态错误) +- 明显的标点符号错误(如:缺少句号、逗号使用错误) +- 明显的语序错误 + +**忽略以下情况:** +- 语法正确但可以优化的表达 +- 风格偏好问题 +- 轻微的标点符号调整 +- 术语选择的差异 +- 表达方式的个人偏好 + +**输出要求:** +- 如果不存在明显错误,直接输出"语法正确,无需关注" +- 只有发现明显错误时才详细列出 +- 不能超过100个汉字字符 +- 遵循"能不提修改意见就不提修改意见"的原则 + +{format_instructions} +""" + human_template = """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def analyze(self, diff_file_info: DiffFileInfo) -> Optional[FileTextAnalysis]: + """分析单个文件的文本变更""" + max_retry = MODEL_MAX_RETRY + for attempt in range(1, max_retry + 1): + # 如果不是第一次尝试,等待一段时间再重试,避免连续失败 + if attempt > 1: + delay = min(attempt * 2, 10) # 递增延迟,最多10秒 + logger.info(f"第{attempt}次尝试分析文件 {diff_file_info.file_path},等待{delay}秒...") + time.sleep(delay) + + try: + # 构造prompt字符串 + prompt_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content + } + + # 直接调用,简化超时控制 + invoke_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content + } + result = self.chain.invoke(invoke_args) + # 验证结果有效性 + if isinstance(result, (dict, FileTextAnalysis)): + if isinstance(result, dict): + result = FileTextAnalysis(**result) + + # 检查结果完整性 + if result and hasattr(result, 'analysis_details') and result.analysis_details: + + # 设置准确值 + result.file_path = diff_file_info.file_path + + # 检查是否只关注明显错误 + analysis_text = result.analysis_details.lower() + if any(phrase in analysis_text for phrase in [ + "语法正确,无需关注", + "无英文文本改动", + "仅标点符号改动,无需关注", + "不存在语法问题" + ]): + # 如果无问题,设置has_text_changes为False + result.has_text_changes = False + result.grammar_issues = [] + + return result + + # 结果无效,记录并重试 + logger.warning(f"分析文件 {diff_file_info.file_path} 返回无效结果,第{attempt}次尝试") + if attempt < max_retry: + continue + except Exception as e: + err_str = str(e) + # 检查是否为HTTP错误(如404、5xx),常见关键字有status code、HTTP、response等 + is_http_error = False + for code in ["404", "500", "502", "503", "504"]: + if code in err_str: + is_http_error = True + break + if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and any(code in err_str for code in ["404", "500", "502", "503", "504"]): + is_http_error = True + if is_http_error: + logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e},第{attempt}次尝试,10秒后重试...") + if attempt < max_retry: + 
time.sleep(10) + continue + else: + logger.error(f"分析文件 {diff_file_info.file_path} 时发生错误: {e},第{attempt}次尝试") + # 其它异常直接进入下一次重试 + if attempt < max_retry: + logger.info(f"第{attempt}次尝试失败,准备重试...") + logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry}次均未获得结构化输出,放弃。") + return None + +class PRAnalysisChain: + """PR整体分析任务链""" + + def __init__(self, llm: ChatOllama | ChatOpenAI): + self.llm = llm + + # 创建输出解析器 + self.output_parser = JsonOutputParser(pydantic_object=PRAnalysisResult) + + # 为硅基流动平台添加输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "has_text_changes": "是否涉及英文文本改动(布尔值)", + "text_change_type": "文本改动类型(字符串)", + "has_grammar_errors": "是否存在语法语病错误(布尔值)", + "grammar_errors": "具体的语法语病错误列表(字符串列表)", + "detailed_analysis": "详细分析说明(字符串)", + "suggestions": "改进建议列表(字符串列表)" +}} +""" + # 创建新的prompt模板 + system_template = """ +你是一个专业的PR审查专家,专门分析Pull Request中的文本变更和语法问题。 + +**核心原则:只关注必然存在明显错误的地方,其他文件都不需要关注!** + +请基于各个文件的分析结果,生成整个PR的综合评估,要求: + +1. 严格过滤文件: + - 只统计存在明显错误的文件 + - 忽略"语法正确,无需关注"的文件 + - 忽略"无英文文本改动"的文件 + - 忽略"仅标点符号改动,无需关注"的文件 + +2. 只汇总明显错误: + - 仅汇总硬伤:明显的单词拼写错误、严重的语法错误 + - 忽略可优化但称不上错误的点 + - 忽略风格偏好问题 + - 忽略轻微的标点符号调整 + +3. 质量评估: + - 只对存在明显错误的文件进行质量评估 + - 如果所有文件都无问题,直接标记为"无问题文件" + +4. 改进建议: + - 只对存在明显错误的文件提供修改建议 + - 建议优先修复明显的拼写和语法错误 + +**输出格式要求:** +- 如果所有文件都无问题,直接输出"所有文件语法正确,无需关注" +- 只列出存在明显错误的文件 +- 按优先级排列发现的问题 +- 给出可操作的改进建议 + +{format_instructions} +""" + human_template = """ +各个文件的分析结果: +{file_analyses} + +总文件数: {total_files} +涉及文本变更的文件数: {text_changed_files} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def generate(self, file_analyses: List[FileTextAnalysis]) -> Optional[PRAnalysisResult]: + """生成PR整体分析""" + try: + total_files = len(file_analyses) + + # 过滤出只关注存在明显错误的文件 + problematic_files = [] + for analysis in file_analyses: + # 检查是否存在明显错误 + has_obvious_errors = ( + analysis.has_text_changes and + analysis.grammar_issues and + len(analysis.grammar_issues) > 0 and + analysis.analysis_details and + not any(phrase in analysis.analysis_details for phrase in [ + "语法正确,无需关注", + "无英文文本改动", + "仅标点符号改动,无需关注", + "不存在语法问题" + ]) + ) + + if has_obvious_errors: + problematic_files.append(analysis) + + # 如果所有文件都无问题,直接返回无问题结果 + if not problematic_files: + return PRAnalysisResult( + has_text_changes=False, + text_change_type="无文本改动", + has_grammar_errors=False, + grammar_errors=[], + detailed_analysis="所有文件语法正确,无需关注", + suggestions=[] + ) + + text_changed_files = len(problematic_files) + + file_analyses_info = [] + for analysis in problematic_files: + file_analyses_info.append({ + 'file_path': analysis.file_path, + 'has_text_changes': analysis.has_text_changes, + 'text_lines': analysis.text_lines, + 'grammar_issues': analysis.grammar_issues, + 'analysis_details': analysis.analysis_details + }) + + # 构造prompt字符串 + prompt_args = { + "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2), + "total_files": total_files, + "text_changed_files": text_changed_files + } + + # 使用线程池执行器为PR分析添加超时控制 + timeout_executor = None + try: + timeout_executor = ThreadPoolExecutor(max_workers=1) + invoke_args = { + "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2), + "total_files": total_files, + "text_changed_files": text_changed_files + } + result = self.chain.invoke(invoke_args) + # 验证结果有效性 + if isinstance(result, (dict, PRAnalysisResult)): + # 
如果是dict(来自JsonOutputParser),转换为PRAnalysisResult + if isinstance(result, dict): + result = PRAnalysisResult(**result) + return result + else: + logger.error(f"生成PR分析时返回类型错误: {type(result)}") + return None + except Exception as e: + logger.error(f"生成PR分析时发生错误: {e}") + return None + except Exception as e: + logger.error(f"生成PR分析时发生错误: {e}") + return None + +# ==================== 主处理类 ==================== + +class PRCommentAnalyzer: + """PR评论分析器""" + + def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", model_name: str = None, base_url: str = None): + if model_name is None: + model_name = MODEL_NAME + + # 设置siliconflow API配置 + global SILICONFLOW_API_KEY, SILICONFLOW_API_BASE + if siliconflow_api_key: + SILICONFLOW_API_KEY = siliconflow_api_key + if siliconflow_api_base: + SILICONFLOW_API_BASE = siliconflow_api_base + + self.llm = LLMFactory.create_chat_llm(model_name) + self.file_analysis_chain = FileTextAnalysisChain(self.llm) + self.pr_analysis_chain = PRAnalysisChain(self.llm) + + def cleanup(self): + """清理资源,确保程序能正确退出""" + try: + # 清理 LLM 连接 + if hasattr(self.llm, 'client') and hasattr(self.llm.client, 'close'): + self.llm.client.close() + elif hasattr(self.llm, '_client') and hasattr(self.llm._client, 'close'): + self.llm._client.close() + + # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端 + if BACKEND_TYPE == "siliconflow" and hasattr(self.llm, 'client'): + try: + # 强制关闭 httpx 客户端 + if hasattr(self.llm.client, '_client'): + self.llm.client._client.close() + except Exception as e: + logger.debug(f"关闭 HTTP 客户端时发生错误: {e}") + + logger.info("资源清理完成") + except Exception as e: + logger.warning(f"清理资源时发生错误: {e}") + + def analyze_pr_diff(self, diff_content: str, max_workers: int = None) -> CommentResult: + if max_workers is None: + max_workers = PROCESSING_MAX_WORKERS + + logger.info("开始解析PR diff...") + files = DiffParser.parse_git_diff(diff_content) + logger.info(f"解析到 {len(files)} 个文件的改动") + if not files: + logger.warning("未找到任何文件改动") + return CommentResult( + pr_analysis=None, + file_analyses=[], + processed_files=0, + total_files=0, + error='未找到任何文件改动' + ) + + logger.info("开始并行处理各个文件的文本分析...") + file_analyses = [] + # 使用更健壮的并发处理机制 + executor = None + try: + executor = ThreadPoolExecutor(max_workers=max_workers) + future_to_file = { + executor.submit(self.file_analysis_chain.analyze, file_info): file_info.file_path + for file_info in files + } + + # 设置更长的整体超时时间,避免与单个文件超时冲突 + overall_timeout = SINGLE_FILE_TIMEOUT * len(files) + 600 # 给每个文件的时间 + 额外缓冲 + + completed_count = 0 + total_count = len(future_to_file) + + try: + for future in as_completed(future_to_file, timeout=overall_timeout): + file_path = future_to_file[future] + completed_count += 1 + try: + analysis = future.result(timeout=5) # 短暂缓冲时间,因为任务已经完成 + if analysis: + file_analyses.append(analysis) + logger.info(f"完成文件 {file_path} 的文本分析 ({completed_count}/{total_count})") + else: + logger.warning(f"文件 {file_path} 的文本分析失败 ({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError) as e: + logger.error(f"文件 {file_path} 的文本分析获取超时,跳过该文件: {type(e).__name__} ({completed_count}/{total_count})") + try: + future.cancel() + except Exception as cancel_e: + logger.warning(f"取消任务时发生错误: {cancel_e}") + except Exception as e: + logger.error(f"处理文件 {file_path} 时发生异常: {e} ({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError) as overall_e: + logger.error(f"整体处理超时({overall_timeout}秒),已完成{completed_count}/{total_count}个文件") + # 取消所有未完成的任务 + for future in future_to_file: 
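                    # -- editor's note (annotation, not part of the recorded patch) --
                    # Future.cancel() only stops tasks that have not started running;
                    # it cannot interrupt a worker thread already blocked inside
                    # chain.invoke(). A cooperative variant would share a flag, e.g.:
                    #
                    #     stop_event = threading.Event()   # hypothetical shared flag
                    #     # ...checked between retries in FileTextAnalysisChain.analyze:
                    #     if stop_event.is_set():
                    #         return None
                    #
                    # with stop_event.set() called from this timeout handler.
                    # ----------------------------------------------------------------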
+ if not future.done(): + try: + future.cancel() + except Exception as cancel_e: + logger.warning(f"取消未完成任务时发生错误: {cancel_e}") + finally: + # 确保线程池被正确关闭 + if executor: + try: + executor.shutdown(wait=True) + except Exception as shutdown_e: + logger.warning(f"关闭主线程池时发生错误: {shutdown_e}") + + logger.info(f"成功生成 {len(file_analyses)} 个文件的文本分析") + logger.info("开始生成PR整体分析...") + pr_analysis = None + if file_analyses: + logger.info(f"基于 {len(file_analyses)} 个成功处理的文件生成PR分析...") + try: + pr_analysis = self.pr_analysis_chain.generate(file_analyses) + if pr_analysis: + logger.info("PR整体分析生成成功") + else: + logger.warning("PR整体分析生成失败") + except Exception as e: + logger.error(f"生成PR分析时发生未预期的错误: {e}") + else: + logger.warning("没有成功处理的文件,跳过PR分析生成") + + return CommentResult( + pr_analysis=pr_analysis, + file_analyses=file_analyses, + processed_files=len(file_analyses), + total_files=len(files) + ) + +# ==================== 主函数 ==================== + +def get_comment_analysis(sample_diff, siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1"): + + analyzer = PRCommentAnalyzer(siliconflow_api_key, siliconflow_api_base) + result = None + try: + result = analyzer.analyze_pr_diff(sample_diff) + finally: + # 确保在函数退出前清理资源 + analyzer.cleanup() + + if not result: + print("处理失败,无法获取结果") + return None + + if result.error: + print(f"错误: {result.error}") + + print("\n=== 单文件文本分析 ===") + problematic_files = [f for f in result.file_analyses if f.has_text_changes and f.grammar_issues] + if problematic_files: + for analysis in problematic_files: + print(f"文件: {analysis.file_path}") + print(f"涉及文本变更: {analysis.has_text_changes}") + print(f"文本变更行: {analysis.text_lines}") + print(f"语法问题: {analysis.grammar_issues}") + print(f"分析详情: {analysis.analysis_details}") + print("-" * 50) + else: + print("所有文件语法正确,无需关注") + + print("=== 处理结果 ===") + print(f"总文件数: {result.total_files}") + print(f"成功处理文件数: {result.processed_files}") + + if result.pr_analysis: + print("\n=== PR整体分析 ===") + pr = result.pr_analysis + print(f"涉及文本变更: {pr.has_text_changes}") + print(f"文本变更类型: {pr.text_change_type}") + print(f"存在语法错误: {pr.has_grammar_errors}") + print(f"语法错误列表: {pr.grammar_errors}") + print(f"详细分析: {pr.detailed_analysis}") + print(f"改进建议: {pr.suggestions}") + + + return result + +if __name__ == "__main__": + # 微服务接口逻辑: 传递进来的就是 sample_diff 的内容 + sample_diff = sys.argv[1] + result = get_comment_analysis(sample_diff) + print(result) diff --git a/ci/tools/comment/create_comment.py b/ci/tools/comment/create_comment.py new file mode 100644 index 000000000..481c7a2f5 --- /dev/null +++ b/ci/tools/comment/create_comment.py @@ -0,0 +1,372 @@ +import argparse +import json +import logging +import re +import sys +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import TypeVar, Generic +from comment_agent import get_comment_analysis + +import requests +import yaml + +logging.basicConfig(level=logging.INFO, stream=sys.stdout, + format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class Org: + org_name: str + comment_target_owner: str + comment_target_repo: str + auto_comment_enabled: bool = field(default=True) + confidence_threshold: float = field(default=0.7) + text_check_enabled: bool = field(default=True) + grammar_check_enabled: bool = field(default=True) + + +@dataclass +class CommentAgentConfig: + backend: dict = field(default_factory=dict) + model: dict = field(default_factory=dict) + processing: dict = 
field(default_factory=dict)
+    logging: dict = field(default_factory=dict)
+
+
+@dataclass
+class Config:
+    orgs: list[dict | Org]
+    comment_agent: dict | CommentAgentConfig = field(default_factory=dict)
+
+    def __post_init__(self):
+        tmp_orgs: list[Org] = []
+        for item in self.orgs:
+            tmp_orgs.append(Org(**item))
+        self.orgs = tmp_orgs
+
+        if isinstance(self.comment_agent, dict) and self.comment_agent:
+            self.comment_agent = CommentAgentConfig(**self.comment_agent)
+
+
+@dataclass
+class ReqArgs:
+    method: str
+    url: str
+    headers: dict[str, str]
+    params: dict[str, str] | None = field(default=None)
+    data: str | None = field(default=None)
+    timeout: int = field(default=180)
+
+
+T = TypeVar('T')
+content_type_is_text = "text/plain"
+content_type_is_json_dict = {}
+content_type_is_json_list = []
+
+
+def send_request(args: ReqArgs, t: T) -> T:
+    error_count = 0
+    while error_count < 3:
+        try:
+            resp = requests.request(**args.__dict__)
+            resp.raise_for_status()
+            if type(t) is dict or type(t) is list:
+                res_data: dict | list = resp.json()
+            else:
+                res_data: str = resp.text
+        except requests.exceptions.RequestException as e:
+            # e.response is None when no response was received (e.g. connection error)
+            if e.response is not None and e.response.status_code in [400, 401, 403, 404, 405]:
+                logger.error("[ERROR] client error {}".format(e))
+                break
+            logger.error("[ERROR] server error: {}".format(e))
+            error_count += 1
+        else:
+            logger.info("[OK] [{}], {}".format(args.method, args.url))
+            return res_data
+    return None
+
+
+class GiteeClient:
+    """
+    Gitee OpenAPI 客户端
+    """
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+    }
+
+    def __init__(self, developer_token: str):
+        """
+        构造函数
+        :param developer_token: Gitee v5 token
+        """
+        self.headers["Authorization"] = "Bearer {}".format(developer_token)
+
+    def get_diff_content(self, owner: str, repo: str, number: int) -> str | None:
+        req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number)
+        req_args = ReqArgs(method="GET", url=req_url, headers=self.headers)
+        result: str | None = send_request(req_args, "")
+        if result is None:
+            logger.error("can not get diff file from PR: {}".format(req_url))
+        return result
+
+    def add_pr_comment(self, owner, repo, number, body):
+        req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number)
+        req_body = {
+            "body": "### 🤖 AI审查反馈 \n {} ".format(body)
+        }
+        req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body))
+        result: dict | None = send_request(req_args, {})
+        return result is not None
+
+
+
+def get_diff_file_list(diff_content: str) -> list[str]:
+    diff_files_list = []
+    diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]]
+    for diff_file in diff_files:
+        if diff_file.endswith('\"'):
+            d = re.compile(r'/[\d\s\S]+')
+            diff_file = d.findall(diff_file)
+            diff_file = diff_file[0].replace('/', '', 1).replace('\"', '')
+            diff_files_list.append(diff_file)
+        else:
+            diff_files_list.append(diff_file)
+    return diff_files_list
+
+
+def generate_comment_content(comment_result, pr_url: str, analysis_status: str = "success") -> str:
+    """根据分析结果生成评论内容"""
+    comment_body = ""
+
+    # 根据分析状态添加不同的状态标识
+    if analysis_status == "error":
+        comment_body += "### 分析状态:处理失败\n"
+        comment_body += "**分析过程中发生错误,无法生成详细反馈。请手动审查文本变更。**\n\n"
+    elif analysis_status == "low_confidence":
+        comment_body += "### 分析状态:置信度较低\n"
+        comment_body += "**当前分析置信度较低,结果仅供参考。建议进行人工审查。**\n\n"
+    elif analysis_status == "no_text_changes":
+        comment_body += "### 分析状态:无文本问题\n"
+        comment_body += 
"**AI分析结果显示本次PR未发现明显的文本变更或语法问题。无需改动。**\n\n" + elif analysis_status == "no_grammar_errors": + comment_body += "### 分析状态:文本质量良好\n" + comment_body += "**检测到文本变更,但未发现明显的语法错误,文本质量良好。无需改动。**\n\n" + else: # success with issues + comment_body += "### 分析状态:发现需要关注的问题\n" + comment_body += "**AI分析发现了一些文本变更或语法问题,请查看下方详细信息。**\n\n" + + # 如果有分析结果,添加详细信息 + if comment_result and not comment_result.error: + # 如果有PR整体分析 + if comment_result.pr_analysis: + pr_analysis = comment_result.pr_analysis + + # 添加整体评估摘要 + comment_body += "## 整体评估\n" + comment_body += f"- 涉及文本变更: {'是' if pr_analysis.has_text_changes else '否'}\n" + comment_body += f"- 文本变更类型: {pr_analysis.text_change_type}\n" + comment_body += f"- 存在语法错误: {'是' if pr_analysis.has_grammar_errors else '否'}\n\n" + + # 添加详细分析 + if pr_analysis.detailed_analysis: + comment_body += "## 详细分析\n" + comment_body += f"{pr_analysis.detailed_analysis}\n\n" + + # 添加语法错误列表 + if pr_analysis.grammar_errors: + comment_body += "## 语法问题\n" + for i, error in enumerate(pr_analysis.grammar_errors, 1): + comment_body += f"{i}. {error}\n" + comment_body += "\n" + + # 添加改进建议 + if pr_analysis.suggestions: + comment_body += "## 改进建议\n" + for i, suggestion in enumerate(pr_analysis.suggestions, 1): + comment_body += f"{i}. {suggestion}\n" + comment_body += "\n" + + # 添加文件级别的分析结果 + if comment_result.file_analyses: + # comment_body += "## 文件分析\n" + + # 统计有问题的文件 + files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues] + files_without_issues = [f for f in comment_result.file_analyses if not f.has_text_changes and not f.grammar_issues] + + if files_with_issues: + comment_body += f"### 需要关注的文件 ({len(files_with_issues)} 个)\n" + for i, file_analysis in enumerate(files_with_issues, 1): + comment_body += f"\n**{i}. {file_analysis.file_path}**\n" + + if file_analysis.has_text_changes: + comment_body += f"- 文本变更: 检测到英文文本改动\n" + if file_analysis.text_lines: + comment_body += f"- 涉及行数: {len(file_analysis.text_lines)} 行\n" + + if file_analysis.grammar_issues: + comment_body += f"- 语法问题: 发现 {len(file_analysis.grammar_issues)} 个问题\n" + for j, issue in enumerate(file_analysis.grammar_issues, 1): + comment_body += f" {j}. 
{issue}\n" + + if file_analysis.analysis_details: + comment_body += f"- 分析详情: {file_analysis.analysis_details}\n" + + if files_without_issues: + comment_body += f"\n### 无问题的文件 ({len(files_without_issues)} 个)\n" + for file_analysis in files_without_issues: + comment_body += f"- {file_analysis.file_path}\n" + + # 添加处理统计 + # comment_body += f"\n### 处理统计\n" + # comment_body += f"- 总文件数: {comment_result.total_files}\n" + # comment_body += f"- 成功分析: {comment_result.processed_files}\n" + # comment_body += f"- 有文本变更: {len([f for f in comment_result.file_analyses if f.has_text_changes])}\n" + # comment_body += f"- 有语法问题: {len([f for f in comment_result.file_analyses if f.grammar_issues])}\n" + + # 添加免责声明 + comment_body += "## 免责声明\n" + comment_body += "本评论内容基于AI Agent技术自动生成,仅供参考。请开发者根据实际情况进行判断和修改。\n" + + return comment_body + + +class Args: + gitee_token: str + pr_owner: str + pr_repo: str + pr_number: int + siliconflow_api_key: str = "" + siliconflow_api_base: str = "https://api.siliconflow.cn/v1" + + def validate(self): + valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number + if not valid: + logger.error("Invalid Command Arguments") + sys.exit(1) + + +def load_config_yaml(yaml_path): + with open(yaml_path, "r", encoding="utf-8") as config_in: + data = yaml.safe_load(config_in) + + if data is None: + return None + return Config(**data) + + +def create_comment_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, + pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str): + pr_html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + + for org_item in conf.orgs: + if org_item.org_name != pr_owner: + continue + + if not org_item.auto_comment_enabled: + logger.info(f"组织 {org_item.org_name} 未启用自动评论功能") + continue + + # 移除文件筛选逻辑,对所有PR平等处理 + logger.info("开始对PR进行全面文本分析(不限制文件类型和路径)") + + # 获取diff内容 + diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) + if diff_content is None: + logger.error("无法获取PR的diff内容") + sys.exit(1) + + # 调用AI Agent进行分析 + logger.info("开始进行AI代码审查分析...") + comment_result = get_comment_analysis(diff_content, siliconflow_api_key, siliconflow_api_base) + + if not comment_result: + logger.error("AI分析失败,将发布错误状态评论") + # 创建一个错误结果对象,确保能发布评论 + from comment_agent import CommentResult + comment_result = CommentResult( + pr_analysis=None, + file_analyses=[], + processed_files=0, + total_files=0, + error="AI分析过程失败" + ) + + # 确定分析状态和评论内容 + analysis_status = "success" + + if comment_result.error: + analysis_status = "error" + logger.info("AI分析过程出错,将发布错误状态评论") + elif comment_result.pr_analysis: + pr_analysis = comment_result.pr_analysis + + # 检查是否有文本变更或语法错误 + if pr_analysis.has_text_changes and pr_analysis.has_grammar_errors: + analysis_status = "success" # 有问题,正常处理 + logger.info("检测到文本变更和语法错误,将发布问题报告评论") + elif pr_analysis.has_text_changes and not pr_analysis.has_grammar_errors: + analysis_status = "no_grammar_errors" + logger.info("检测到文本变更但无语法错误,将发布文本质量良好的评论") + elif not pr_analysis.has_text_changes: + analysis_status = "no_text_changes" + logger.info("未检测到文本变更,将发布无文本问题的评论") + else: + analysis_status = "success" + logger.info("检测到需要关注的问题,将发布详细分析评论") + else: + # 如果没有整体分析,检查是否有文件级别的问题 + files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues] + if files_with_issues: + analysis_status = "success" + logger.info(f"检测到 {len(files_with_issues)} 个文件有文本问题,将发布文件级别问题评论") + else: + analysis_status = "no_text_changes" + 
logger.info("未检测到文件级别问题,将发布无问题评论") + + # 总是生成和发布评论 + comment_content = generate_comment_content( + comment_result, + pr_html_url, + analysis_status + ) + + # 发布评论 + success = cli.add_pr_comment(pr_owner, pr_repo, pr_number, comment_content) + if success: + logger.info(f"AI代码审查评论发布成功 - 状态: {analysis_status}") + else: + logger.error(f"AI代码审查评论发布失败 - 状态: {analysis_status}") + + +def main(): + parser = argparse.ArgumentParser(description='Create AI-powered PR comment based on text analysis') + parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') + parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') + parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') + parser.add_argument('--pr_number', type=str, required=True, help='the PR number') + parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') + parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') + args = Args() + parser.parse_args(args=sys.argv[1:], namespace=args) + args.validate() + + exec_py = sys.argv[0] + config_yaml_path = exec_py[:-2] + 'yaml' + conf = load_config_yaml(config_yaml_path) + + cli = GiteeClient(args.gitee_token) + + pr_owner = args.pr_owner + pr_repo = args.pr_repo + pr_number = args.pr_number + siliconflow_api_key = args.siliconflow_api_key + siliconflow_api_base = args.siliconflow_api_base + create_comment_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) + + +if __name__ == '__main__': + main() diff --git a/ci/tools/comment/create_comment.yaml b/ci/tools/comment/create_comment.yaml new file mode 100644 index 000000000..32ac269ab --- /dev/null +++ b/ci/tools/comment/create_comment.yaml @@ -0,0 +1,38 @@ +# Comment Agent Configuration +comment_agent: + # Backend Configuration + backend: + type: "siliconflow" # Options: "ollama" or "siliconflow" + # siliconflow配置现在通过命令行参数传入 + + # Model Configuration + model: + name: "Qwen/Qwen3-8B" + temperature: 0.1 + max_retry: 5 # For siliconflow backend + + # Processing Configuration + processing: + max_workers: 8 # Number of parallel workers for file processing + single_file_timeout: 180 # Timeout for single file analysis (seconds) + total_comment_timeout: 300 # Timeout for total PR analysis (seconds) + + # Logging Configuration + logging: + level: "INFO" + +# PR Comment Configuration +orgs: + - org_name: openeuler + comment_target_owner: openeuler + comment_target_repo: docs + auto_comment_enabled: true + text_check_enabled: true # 是否启用文本变更检测 + grammar_check_enabled: true # 是否启用语法错误检测 + + - org_name: src-openeuler + comment_target_owner: openeuler + comment_target_repo: globalization + auto_comment_enabled: true + text_check_enabled: true + grammar_check_enabled: true \ No newline at end of file -- Gitee From b05188d8577a7191bf700ab07f925199ff3f62d0 Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Sat, 20 Sep 2025 20:39:00 +0800 Subject: [PATCH 3/8] update ci-bot for auto generating translation issue --- ci/tools/comment/comment_agent.py | 958 --------------------------- ci/tools/comment/create_comment.py | 372 ----------- ci/tools/comment/create_comment.yaml | 38 -- 3 files changed, 1368 deletions(-) delete mode 100644 ci/tools/comment/comment_agent.py delete mode 100644 ci/tools/comment/create_comment.py delete mode 100644 ci/tools/comment/create_comment.yaml diff --git 
a/ci/tools/comment/comment_agent.py b/ci/tools/comment/comment_agent.py deleted file mode 100644 index 25dbe385c..000000000 --- a/ci/tools/comment/comment_agent.py +++ /dev/null @@ -1,958 +0,0 @@ -import json -import re -import logging -import urllib.parse -from typing import List, Dict, Any, Optional, Tuple, Literal -from dataclasses import dataclass -from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError -from pathlib import Path -import sys -import time -# LangChain imports -from langchain_core.prompts import ChatPromptTemplate, PromptTemplate -from langchain_core.runnables import RunnableLambda, RunnablePassthrough -from pydantic import BaseModel, Field, SecretStr -from langchain_community.llms import Ollama -from langchain_ollama import ChatOllama -from langchain.chains import TransformChain, SequentialChain -from langchain_core.output_parsers import JsonOutputParser -from langchain_openai import ChatOpenAI -import yaml - -# ==================== 配置加载 ==================== - -def load_config(config_file="create_comment.yaml"): - """从YAML文件加载配置""" - try: - with open(config_file, 'r', encoding='utf-8') as f: - config = yaml.safe_load(f) - return config.get('comment_agent', {}) - except FileNotFoundError: - print(f"配置文件 {config_file} 不存在") - raise - except yaml.YAMLError as e: - print(f"解析配置文件时发生错误: {e}") - raise - -# 加载配置 -_config = load_config() - -# ==================== 配置常量 ==================== - -BACKEND_TYPE = _config.get('backend', {}).get('type', 'siliconflow') -MODEL_NAME = _config.get('model', {}).get('name', 'Qwen/Qwen3-8B') -MODEL_TEMPERATURE = _config.get('model', {}).get('temperature', 0.1) -MODEL_MAX_RETRY = _config.get('model', {}).get('max_retry', 5) -PROCESSING_MAX_WORKERS = _config.get('processing', {}).get('max_workers', 8) -SINGLE_FILE_TIMEOUT = _config.get('processing', {}).get('single_file_timeout', 180) -TOTAL_COMMENT_TIMEOUT = _config.get('processing', {}).get('total_comment_timeout', 300) -LOGGING_LEVEL = _config.get('logging', {}).get('level', 'INFO') -SILICONFLOW_API_KEY = '' -SILICONFLOW_API_BASE = '' - -# 配置日志 -logging.basicConfig(level=getattr(logging, LOGGING_LEVEL.upper())) -logger = logging.getLogger(__name__) - -# ==================== 数据模型定义 ==================== - -class PRAnalysisResult(BaseModel): - """PR分析结果的结构化输出""" - has_text_changes: bool = Field(description="是否涉及英文文本改动", default=False) - text_change_type: Literal["无文本改动", "仅标点符号改动", "英文内容改动", "代码注释改动", "混合改动"] = Field(description="文本改动类型") - has_grammar_errors: bool = Field(description="是否存在语法语病错误", default=False) - grammar_errors: List[str] = Field(description="具体的语法语病错误列表", default=[]) - detailed_analysis: str = Field(description="详细分析说明") - suggestions: List[str] = Field(description="改进建议列表", default=[]) - -class FileTextAnalysis(BaseModel): - """单个文件的文本分析""" - file_path: str = Field(description="文件路径", default="") - has_text_changes: bool = Field(description="是否涉及英文文本改动", default=False) - text_lines: List[str] = Field(description="涉及文本改动的行", default=[]) - grammar_issues: List[str] = Field(description="语法问题列表", default=[]) - analysis_details: str = Field(description="分析详情") - -@dataclass -class DiffFileInfo: - """单个文件的diff信息""" - file_path: str - diff_content: str - lines_added: int - lines_deleted: int - -@dataclass -class CommentResult: - """评论生成结果""" - pr_analysis: Optional[PRAnalysisResult] - file_analyses: List[FileTextAnalysis] - processed_files: int - total_files: int - error: Optional[str] = None - -# ==================== Token 统计工具 
==================== - - -# ==================== 工具函数 ==================== - -class DiffParser: - """Git Diff 解析器""" - - @staticmethod - def parse_git_diff(diff_content: str) -> List[DiffFileInfo]: - """ - 解析git diff内容,提取每个文件的改动信息 - - Args: - diff_content: git diff的原始内容 - - Returns: - 包含文件路径和对应diff内容的列表 - """ - - files = [] - current_file = None - current_diff = [] - - lines = diff_content.strip().split('\n') - - for line in lines: - # 匹配文件路径行 - if line.startswith('diff --git'): - # 保存前一个文件的信息 - if current_file and current_diff: - diff_info = DiffParser._create_diff_file_info(current_file, current_diff) - if diff_info: - files.append(diff_info) - - # 提取文件路径 - 改进的解析逻辑 - current_file = DiffParser._extract_file_path(line) - if current_file: - current_diff = [line] - else: - current_diff = [] - elif current_file: - current_diff.append(line) - - # 添加最后一个文件 - if current_file and current_diff: - diff_info = DiffParser._create_diff_file_info(current_file, current_diff) - if diff_info: - files.append(diff_info) - - return files - - @staticmethod - def _extract_file_path(diff_line: str) -> Optional[str]: - """ - 从git diff行中提取文件路径,支持包含汉字的文件名 - - Args: - diff_line: git diff的文件头行,格式如 "diff --git a/path/to/file b/path/to/file" - - Returns: - 提取出的文件路径,如果解析失败则返回None - """ - try: - # 方法1: 处理引号包围的路径(Git对特殊字符的处理) - # 格式: diff --git "a/path/to/file" "b/path/to/file" - quoted_pattern = r'diff --git "a/(.+?)" "b/(.+?)"' - quoted_match = re.match(quoted_pattern, diff_line) - - if quoted_match: - file_path_a = quoted_match.group(1) - file_path_b = quoted_match.group(2) - # 通常a和b路径相同,使用a路径(旧文件路径) - file_path = file_path_a - else: - # 方法2: 使用正则表达式匹配标准的git diff格式 - # 格式: diff --git a/path/to/file b/path/to/file - pattern = r'diff --git a/(.+?) b/(.+?)(?:\s|$)' - match = re.match(pattern, diff_line) - - if match: - file_path_a = match.group(1) - file_path_b = match.group(2) - # 通常a和b路径相同,使用a路径(旧文件路径) - file_path = file_path_a - else: - # 方法3: 如果正则匹配失败,尝试更简单的解析 - # 处理可能包含空格和特殊字符的文件名 - if ' a/' in diff_line and ' b/' in diff_line: - # 找到 a/ 和 b/ 的位置 - a_pos = diff_line.find(' a/') - b_pos = diff_line.find(' b/') - - if a_pos != -1 and b_pos != -1 and a_pos < b_pos: - # 提取a/和b/之间的路径 - a_start = a_pos + 3 # 跳过 ' a/' - file_path = diff_line[a_start:b_pos] - else: - return None - else: - # 方法4: 最后的备选方案,简单的字符串分割 - parts = diff_line.split() - if len(parts) >= 3: - a_path = parts[2] - if a_path.startswith('a/'): - file_path = a_path[2:] # 移除'a/'前缀 - else: - return None - else: - return None - - # 处理文件名编码 - return DiffParser._decode_file_path(file_path) - - except Exception as e: - logger.warning(f"解析文件路径时发生错误: {e}, diff行: {diff_line}") - return None - - @staticmethod - def _decode_file_path(file_path: str) -> str: - """ - 解码文件路径,处理各种编码情况 - - Args: - file_path: 原始文件路径 - - Returns: - 解码后的文件路径 - """ - try: - # 首先尝试URL解码,处理Git编码的文件名 - decoded_path = urllib.parse.unquote(file_path, encoding='utf-8') - - # 处理Git对特殊字符的引号包装 - if decoded_path.startswith('"') and decoded_path.endswith('"'): - decoded_path = decoded_path[1:-1] - # Git使用反斜杠转义,需要处理转义序列 - decoded_path = decoded_path.replace('\\"', '"') - decoded_path = decoded_path.replace('\\\\', '\\') - - # 无论是否有引号包装,都尝试处理八进制编码 - # 检查是否包含八进制转义序列 - if '\\' in decoded_path and re.search(r'\\[0-7]{3}', decoded_path): - decoded_path = DiffParser._decode_octal_sequences(decoded_path) - - return decoded_path - - except Exception as e: - logger.warning(f"解码文件路径时发生错误: {e}, 原始路径: {file_path}") - return file_path - - @staticmethod - def _decode_octal_sequences(text: str) -> str: - """ - 
解码文本中的八进制转义序列 - - Args: - text: 包含八进制转义序列的文本 - - Returns: - 解码后的文本 - """ - try: - # 查找八进制转义序列模式:\xxx - pattern = r'\\([0-7]{3})' - - # 找到所有八进制序列 - matches = list(re.finditer(pattern, text)) - if not matches: - return text - - # 收集所有字节值 - result = "" - last_end = 0 - bytes_buffer = [] - - for i, match in enumerate(matches): - # 添加匹配前的文本 - if match.start() > last_end: - # 如果有缓冲的字节,先处理它们 - if bytes_buffer: - try: - decoded_bytes = bytes(bytes_buffer).decode('utf-8') - result += decoded_bytes - bytes_buffer = [] - except UnicodeDecodeError: - # 如果解码失败,保持原始形式 - for byte_val in bytes_buffer: - result += f"\\{oct(byte_val)[2:].zfill(3)}" - bytes_buffer = [] - - result += text[last_end:match.start()] - - # 处理当前八进制序列 - octal_str = match.group(1) - try: - byte_value = int(octal_str, 8) - bytes_buffer.append(byte_value) - except ValueError: - # 如果转换失败,添加原始字符串 - if bytes_buffer: - try: - decoded_bytes = bytes(bytes_buffer).decode('utf-8') - result += decoded_bytes - bytes_buffer = [] - except UnicodeDecodeError: - for byte_val in bytes_buffer: - result += f"\\{oct(byte_val)[2:].zfill(3)}" - bytes_buffer = [] - result += match.group(0) - - last_end = match.end() - - # 检查是否是最后一个匹配或下一个匹配不连续 - is_last = (i == len(matches) - 1) - is_next_non_consecutive = (not is_last and - matches[i + 1].start() != match.end()) - - if is_last or is_next_non_consecutive: - # 处理缓冲的字节 - if bytes_buffer: - try: - decoded_bytes = bytes(bytes_buffer).decode('utf-8') - except UnicodeDecodeError: - # 如果解码失败,保持原始形式 - for byte_val in bytes_buffer: - result += f"\\{oct(byte_val)[2:].zfill(3)}" - bytes_buffer = [] - - # 添加剩余的文本 - if last_end < len(text): - result += text[last_end:] - - return result - - except Exception as e: - logger.warning(f"解码八进制序列时发生错误: {e}, 原始文本: {text}") - return text - - @staticmethod - def _create_diff_file_info(file_path: str, diff_lines: List[str]) -> Optional[DiffFileInfo]: - """创建DiffFileInfo对象""" - diff_content = '\n'.join(diff_lines) - lines_added, lines_deleted = DiffParser._count_lines_changed(diff_content) - - return DiffFileInfo( - file_path=file_path, - diff_content=diff_content, - lines_added=lines_added, - lines_deleted=lines_deleted - ) - - @staticmethod - def _count_lines_changed(diff_content: str) -> Tuple[int, int]: - """统计git diff中改动的行数""" - lines_added, lines_deleted = 0, 0 - lines = diff_content.strip().split('\n') - - for line in lines: - # 统计新增行(以+开头,但不是+++) - if line.startswith('+') and not line.startswith('+++'): - lines_added += 1 - # 统计删除行(以-开头,但不是---) - elif line.startswith('-') and not line.startswith('---'): - lines_deleted += 1 - - return lines_added, lines_deleted - -# ==================== LangChain 组件 ==================== - -class LLMFactory: - """LLM工厂类""" - - @staticmethod - def create_chat_llm(model_name: str = None, base_url: str = None): - """创建LLM实例""" - if model_name is None: - model_name = MODEL_NAME - - if BACKEND_TYPE == "siliconflow": - return ChatOpenAI( - model=model_name, - api_key=SecretStr(SILICONFLOW_API_KEY), - base_url=SILICONFLOW_API_BASE, - temperature=MODEL_TEMPERATURE - ) - else: - raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}") - -class PromptTemplates: - """提示模板集合""" - - @staticmethod - def get_file_text_analysis_prompt() -> ChatPromptTemplate: - """获取单文件文本分析提示模板""" - return ChatPromptTemplate.from_messages([ - ("system", f""" -你是一个专业的代码审查和语言专家,专注于分析Gitee文档仓库的翻译PR中的英文文本内容。每条PR都是人工生成的文档改动。请忽略中文、格式和代码的审计,专注于识别英文文本变更。 - -注意:请忽略中文、格式和代码的审计,专注于识别英文文本变更。如果文档的变更不涉及英文文本,你只需要输出“不涉及英文改动”即可,不需要额外输出任何分析结果。 
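# -- editor's note (annotation, not part of the recorded patch) ---------------
# The fixed reply phrases demanded by this prompt are load-bearing: analyze()
# later substring-matches a sentinel list to clear has_text_changes, roughly:
#
#     SENTINELS = ("无英文文本改动", "语法正确,无需关注",
#                  "仅标点符号改动,无需关注", "不存在语法问题")
#     if any(s in result.analysis_details for s in SENTINELS):
#         result.has_text_changes = False
#
# The wording requested here, "不涉及英文改动", is not in that list (this
# PromptTemplates template appears unused; FileTextAnalysisChain builds its own
# prompt), so replies following this instruction would bypass that check.
# ------------------------------------------------------------------------------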
-同时:对于专有名词,例如openEuler、GitHub等,你不能将其纳入英文文本变更的纠错范围内,而是应该自动识别专有名词。对于代码的相关变更,也不应该纳入分析内容范围。 - -你需要遵循**能不提修改意见就不提修改意见**的原则进行审查!!! - -请仔细分析这个文件的改动,并按照以下要求进行分析: - -**分析重点:** - -1. 英文文本变更识别: - - 检查是否涉及英文文本内容的改动 - - 区分代码逻辑变更和英文文本内容变更 - - 识别注释、文档字符串、用户显示文本等英文文本内容 - - 标识出具体的英文文本变更行 - -2. 语法错误检测: - - 检查英文文本的语法、拼写错误 - -**分析类型判断:** -- 如果改动不涉及任何英文文本内容,标记为"无英文文本改动" -- 如果涉及代码注释的英文文本变更,标记为"代码注释改动" -- 如果涉及文档、界面文本等英文内容变更,标记为"英文内容改动" - -**语法检查重点:** -- 英文:主谓一致、时态、拼写、标点、语序 - -**输出要求:** -- 如果存在英文文本变更但变更不存在语法问题,则直接输出“不存在语法问题”,不需要任何额外输出 -- 详细列出发现的语法错误(如果有) -- 不能超过100个汉字字符 - - """), - ("human", """ -文件路径: {file_path} - -Git Diff 内容: -{diff_content} - - """) - ]) - - @staticmethod - def get_pr_analysis_prompt() -> ChatPromptTemplate: - """获取整体PR分析提示模板""" - return ChatPromptTemplate.from_messages([ - ("system", """ -你是一个专业的PR审查专家,专门分析Gitee文档仓库的翻译PR中的英文文本变更和语法问题。每条PR都是人工生成的文档改动。 - -请分析所有文件的改动,并生成一个综合评估,要求: - -1. 整体文本变更评估: - - 统计涉及文本变更的文件数量 - - 分析文本变更的类型分布 - - 评估变更的重要性和影响范围 - - 如果文本变更不涉及英文,或涉及英文但使用正确不需要改动,则**直接忽略**,无需对其进行总结 - -2. 语法错误汇总: - - **仅汇总改动中的硬伤,如单词拼写错误、英语语法(时态语态)错误等** - - **对于一些可以优化但称不上错误的点,以最小化改动为原则,选择忽略** - - 提高报错阈值,忽略可优化翻译的点 - - 提供优先修复建议 - -3. 质量评估: - - 对整个PR的文本质量给出评分 - - 分析文本变更的一致性 - - 评估对用户体验的影响 - -4. 改进建议: - - 提供具体的修改建议 - - 推荐最佳实践 - - 建议后续的质量控制措施 - -**输出格式要求:** -- 提供清晰的分析结论 -- 按优先级排列发现的问题 -- 给出可操作的改进建议 - - """), - ("human", """ -各个文件的分析结果: -{file_analyses} - -总文件数: {total_files} -涉及文本变更的文件数: {text_changed_files} - """) - ]) - -class FileTextAnalysisChain: - """单文件文本分析任务链""" - - def __init__(self, llm: ChatOpenAI): - self.llm = llm - - # 创建输出解析器 - self.output_parser = JsonOutputParser(pydantic_object=FileTextAnalysis) - - # 为硅基流动平台添加输出格式说明 - format_instructions = """ -请以JSON格式输出,包含以下字段: -{{ - "has_text_changes": "是否涉及英文文本改动(布尔值)", - "text_lines": "涉及文本改动的行(字符串列表)", - "grammar_issues": "语法问题列表(字符串列表)", - "analysis_details": "分析详情(字符串)" -}} -""" - # 创建新的prompt模板 - system_template = """ -你是一个专业的代码审查和语言专家,专注于分析Gitee文档仓库的翻译PR中的英文文本内容。每条PR都是人工生成的文档改动。 - -**核心原则:只关注必然存在明显错误的地方,其他文件都不需要关注!** - -**严格过滤条件:** -1. 如果文档的变更不涉及英文文本,直接标记为"无英文文本改动",无需任何分析 -2. 如果涉及英文文本但语法完全正确,直接标记为"语法正确,无需关注" -3. 如果仅涉及标点符号的微小调整,直接标记为"仅标点符号改动,无需关注" -4. 对于专有名词(如openEuler、GitHub等),自动识别并忽略,不纳入纠错范围 -5. 
对于代码相关变更,不纳入分析内容范围 - -**只关注以下明显错误:** -- 明显的单词拼写错误(如:recieve -> receive) -- 严重的语法错误(如:主谓不一致、时态错误) -- 明显的标点符号错误(如:缺少句号、逗号使用错误) -- 明显的语序错误 - -**忽略以下情况:** -- 语法正确但可以优化的表达 -- 风格偏好问题 -- 轻微的标点符号调整 -- 术语选择的差异 -- 表达方式的个人偏好 - -**输出要求:** -- 如果不存在明显错误,直接输出"语法正确,无需关注" -- 只有发现明显错误时才详细列出 -- 不能超过100个汉字字符 -- 遵循"能不提修改意见就不提修改意见"的原则 - -{format_instructions} -""" - human_template = """ -文件路径: {file_path} - -Git Diff 内容: -{diff_content} -""" - self.prompt = ChatPromptTemplate.from_messages([ - ("system", system_template.format(format_instructions=format_instructions)), - ("human", human_template) - ]) - self.chain = self.prompt | self.llm | self.output_parser - - def analyze(self, diff_file_info: DiffFileInfo) -> Optional[FileTextAnalysis]: - """分析单个文件的文本变更""" - max_retry = MODEL_MAX_RETRY - for attempt in range(1, max_retry + 1): - # 如果不是第一次尝试,等待一段时间再重试,避免连续失败 - if attempt > 1: - delay = min(attempt * 2, 10) # 递增延迟,最多10秒 - logger.info(f"第{attempt}次尝试分析文件 {diff_file_info.file_path},等待{delay}秒...") - time.sleep(delay) - - try: - # 构造prompt字符串 - prompt_args = { - "file_path": diff_file_info.file_path, - "diff_content": diff_file_info.diff_content - } - - # 直接调用,简化超时控制 - invoke_args = { - "file_path": diff_file_info.file_path, - "diff_content": diff_file_info.diff_content - } - result = self.chain.invoke(invoke_args) - # 验证结果有效性 - if isinstance(result, (dict, FileTextAnalysis)): - if isinstance(result, dict): - result = FileTextAnalysis(**result) - - # 检查结果完整性 - if result and hasattr(result, 'analysis_details') and result.analysis_details: - - # 设置准确值 - result.file_path = diff_file_info.file_path - - # 检查是否只关注明显错误 - analysis_text = result.analysis_details.lower() - if any(phrase in analysis_text for phrase in [ - "语法正确,无需关注", - "无英文文本改动", - "仅标点符号改动,无需关注", - "不存在语法问题" - ]): - # 如果无问题,设置has_text_changes为False - result.has_text_changes = False - result.grammar_issues = [] - - return result - - # 结果无效,记录并重试 - logger.warning(f"分析文件 {diff_file_info.file_path} 返回无效结果,第{attempt}次尝试") - if attempt < max_retry: - continue - except Exception as e: - err_str = str(e) - # 检查是否为HTTP错误(如404、5xx),常见关键字有status code、HTTP、response等 - is_http_error = False - for code in ["404", "500", "502", "503", "504"]: - if code in err_str: - is_http_error = True - break - if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and any(code in err_str for code in ["404", "500", "502", "503", "504"]): - is_http_error = True - if is_http_error: - logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e},第{attempt}次尝试,10秒后重试...") - if attempt < max_retry: - time.sleep(10) - continue - else: - logger.error(f"分析文件 {diff_file_info.file_path} 时发生错误: {e},第{attempt}次尝试") - # 其它异常直接进入下一次重试 - if attempt < max_retry: - logger.info(f"第{attempt}次尝试失败,准备重试...") - logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry}次均未获得结构化输出,放弃。") - return None - -class PRAnalysisChain: - """PR整体分析任务链""" - - def __init__(self, llm: ChatOllama | ChatOpenAI): - self.llm = llm - - # 创建输出解析器 - self.output_parser = JsonOutputParser(pydantic_object=PRAnalysisResult) - - # 为硅基流动平台添加输出格式说明 - format_instructions = """ -请以JSON格式输出,包含以下字段: -{{ - "has_text_changes": "是否涉及英文文本改动(布尔值)", - "text_change_type": "文本改动类型(字符串)", - "has_grammar_errors": "是否存在语法语病错误(布尔值)", - "grammar_errors": "具体的语法语病错误列表(字符串列表)", - "detailed_analysis": "详细分析说明(字符串)", - "suggestions": "改进建议列表(字符串列表)" -}} -""" - # 创建新的prompt模板 - system_template = """ -你是一个专业的PR审查专家,专门分析Pull Request中的文本变更和语法问题。 - -**核心原则:只关注必然存在明显错误的地方,其他文件都不需要关注!** - -请基于各个文件的分析结果,生成整个PR的综合评估,要求: - -1. 
严格过滤文件: - - 只统计存在明显错误的文件 - - 忽略"语法正确,无需关注"的文件 - - 忽略"无英文文本改动"的文件 - - 忽略"仅标点符号改动,无需关注"的文件 - -2. 只汇总明显错误: - - 仅汇总硬伤:明显的单词拼写错误、严重的语法错误 - - 忽略可优化但称不上错误的点 - - 忽略风格偏好问题 - - 忽略轻微的标点符号调整 - -3. 质量评估: - - 只对存在明显错误的文件进行质量评估 - - 如果所有文件都无问题,直接标记为"无问题文件" - -4. 改进建议: - - 只对存在明显错误的文件提供修改建议 - - 建议优先修复明显的拼写和语法错误 - -**输出格式要求:** -- 如果所有文件都无问题,直接输出"所有文件语法正确,无需关注" -- 只列出存在明显错误的文件 -- 按优先级排列发现的问题 -- 给出可操作的改进建议 - -{format_instructions} -""" - human_template = """ -各个文件的分析结果: -{file_analyses} - -总文件数: {total_files} -涉及文本变更的文件数: {text_changed_files} -""" - self.prompt = ChatPromptTemplate.from_messages([ - ("system", system_template.format(format_instructions=format_instructions)), - ("human", human_template) - ]) - self.chain = self.prompt | self.llm | self.output_parser - - def generate(self, file_analyses: List[FileTextAnalysis]) -> Optional[PRAnalysisResult]: - """生成PR整体分析""" - try: - total_files = len(file_analyses) - - # 过滤出只关注存在明显错误的文件 - problematic_files = [] - for analysis in file_analyses: - # 检查是否存在明显错误 - has_obvious_errors = ( - analysis.has_text_changes and - analysis.grammar_issues and - len(analysis.grammar_issues) > 0 and - analysis.analysis_details and - not any(phrase in analysis.analysis_details for phrase in [ - "语法正确,无需关注", - "无英文文本改动", - "仅标点符号改动,无需关注", - "不存在语法问题" - ]) - ) - - if has_obvious_errors: - problematic_files.append(analysis) - - # 如果所有文件都无问题,直接返回无问题结果 - if not problematic_files: - return PRAnalysisResult( - has_text_changes=False, - text_change_type="无文本改动", - has_grammar_errors=False, - grammar_errors=[], - detailed_analysis="所有文件语法正确,无需关注", - suggestions=[] - ) - - text_changed_files = len(problematic_files) - - file_analyses_info = [] - for analysis in problematic_files: - file_analyses_info.append({ - 'file_path': analysis.file_path, - 'has_text_changes': analysis.has_text_changes, - 'text_lines': analysis.text_lines, - 'grammar_issues': analysis.grammar_issues, - 'analysis_details': analysis.analysis_details - }) - - # 构造prompt字符串 - prompt_args = { - "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2), - "total_files": total_files, - "text_changed_files": text_changed_files - } - - # 使用线程池执行器为PR分析添加超时控制 - timeout_executor = None - try: - timeout_executor = ThreadPoolExecutor(max_workers=1) - invoke_args = { - "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2), - "total_files": total_files, - "text_changed_files": text_changed_files - } - result = self.chain.invoke(invoke_args) - # 验证结果有效性 - if isinstance(result, (dict, PRAnalysisResult)): - # 如果是dict(来自JsonOutputParser),转换为PRAnalysisResult - if isinstance(result, dict): - result = PRAnalysisResult(**result) - return result - else: - logger.error(f"生成PR分析时返回类型错误: {type(result)}") - return None - except Exception as e: - logger.error(f"生成PR分析时发生错误: {e}") - return None - except Exception as e: - logger.error(f"生成PR分析时发生错误: {e}") - return None - -# ==================== 主处理类 ==================== - -class PRCommentAnalyzer: - """PR评论分析器""" - - def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", model_name: str = None, base_url: str = None): - if model_name is None: - model_name = MODEL_NAME - - # 设置siliconflow API配置 - global SILICONFLOW_API_KEY, SILICONFLOW_API_BASE - if siliconflow_api_key: - SILICONFLOW_API_KEY = siliconflow_api_key - if siliconflow_api_base: - SILICONFLOW_API_BASE = siliconflow_api_base - - self.llm = LLMFactory.create_chat_llm(model_name) - self.file_analysis_chain = FileTextAnalysisChain(self.llm) - 
self.pr_analysis_chain = PRAnalysisChain(self.llm) - - def cleanup(self): - """清理资源,确保程序能正确退出""" - try: - # 清理 LLM 连接 - if hasattr(self.llm, 'client') and hasattr(self.llm.client, 'close'): - self.llm.client.close() - elif hasattr(self.llm, '_client') and hasattr(self.llm._client, 'close'): - self.llm._client.close() - - # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端 - if BACKEND_TYPE == "siliconflow" and hasattr(self.llm, 'client'): - try: - # 强制关闭 httpx 客户端 - if hasattr(self.llm.client, '_client'): - self.llm.client._client.close() - except Exception as e: - logger.debug(f"关闭 HTTP 客户端时发生错误: {e}") - - logger.info("资源清理完成") - except Exception as e: - logger.warning(f"清理资源时发生错误: {e}") - - def analyze_pr_diff(self, diff_content: str, max_workers: int = None) -> CommentResult: - if max_workers is None: - max_workers = PROCESSING_MAX_WORKERS - - logger.info("开始解析PR diff...") - files = DiffParser.parse_git_diff(diff_content) - logger.info(f"解析到 {len(files)} 个文件的改动") - if not files: - logger.warning("未找到任何文件改动") - return CommentResult( - pr_analysis=None, - file_analyses=[], - processed_files=0, - total_files=0, - error='未找到任何文件改动' - ) - - logger.info("开始并行处理各个文件的文本分析...") - file_analyses = [] - # 使用更健壮的并发处理机制 - executor = None - try: - executor = ThreadPoolExecutor(max_workers=max_workers) - future_to_file = { - executor.submit(self.file_analysis_chain.analyze, file_info): file_info.file_path - for file_info in files - } - - # 设置更长的整体超时时间,避免与单个文件超时冲突 - overall_timeout = SINGLE_FILE_TIMEOUT * len(files) + 600 # 给每个文件的时间 + 额外缓冲 - - completed_count = 0 - total_count = len(future_to_file) - - try: - for future in as_completed(future_to_file, timeout=overall_timeout): - file_path = future_to_file[future] - completed_count += 1 - try: - analysis = future.result(timeout=5) # 短暂缓冲时间,因为任务已经完成 - if analysis: - file_analyses.append(analysis) - logger.info(f"完成文件 {file_path} 的文本分析 ({completed_count}/{total_count})") - else: - logger.warning(f"文件 {file_path} 的文本分析失败 ({completed_count}/{total_count})") - except (FutureTimeoutError, TimeoutError) as e: - logger.error(f"文件 {file_path} 的文本分析获取超时,跳过该文件: {type(e).__name__} ({completed_count}/{total_count})") - try: - future.cancel() - except Exception as cancel_e: - logger.warning(f"取消任务时发生错误: {cancel_e}") - except Exception as e: - logger.error(f"处理文件 {file_path} 时发生异常: {e} ({completed_count}/{total_count})") - except (FutureTimeoutError, TimeoutError) as overall_e: - logger.error(f"整体处理超时({overall_timeout}秒),已完成{completed_count}/{total_count}个文件") - # 取消所有未完成的任务 - for future in future_to_file: - if not future.done(): - try: - future.cancel() - except Exception as cancel_e: - logger.warning(f"取消未完成任务时发生错误: {cancel_e}") - finally: - # 确保线程池被正确关闭 - if executor: - try: - executor.shutdown(wait=True) - except Exception as shutdown_e: - logger.warning(f"关闭主线程池时发生错误: {shutdown_e}") - - logger.info(f"成功生成 {len(file_analyses)} 个文件的文本分析") - logger.info("开始生成PR整体分析...") - pr_analysis = None - if file_analyses: - logger.info(f"基于 {len(file_analyses)} 个成功处理的文件生成PR分析...") - try: - pr_analysis = self.pr_analysis_chain.generate(file_analyses) - if pr_analysis: - logger.info("PR整体分析生成成功") - else: - logger.warning("PR整体分析生成失败") - except Exception as e: - logger.error(f"生成PR分析时发生未预期的错误: {e}") - else: - logger.warning("没有成功处理的文件,跳过PR分析生成") - - return CommentResult( - pr_analysis=pr_analysis, - file_analyses=file_analyses, - processed_files=len(file_analyses), - total_files=len(files) - ) - -# ==================== 主函数 ==================== - -def get_comment_analysis(sample_diff, 
siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1"): - - analyzer = PRCommentAnalyzer(siliconflow_api_key, siliconflow_api_base) - result = None - try: - result = analyzer.analyze_pr_diff(sample_diff) - finally: - # 确保在函数退出前清理资源 - analyzer.cleanup() - - if not result: - print("处理失败,无法获取结果") - return None - - if result.error: - print(f"错误: {result.error}") - - print("\n=== 单文件文本分析 ===") - problematic_files = [f for f in result.file_analyses if f.has_text_changes and f.grammar_issues] - if problematic_files: - for analysis in problematic_files: - print(f"文件: {analysis.file_path}") - print(f"涉及文本变更: {analysis.has_text_changes}") - print(f"文本变更行: {analysis.text_lines}") - print(f"语法问题: {analysis.grammar_issues}") - print(f"分析详情: {analysis.analysis_details}") - print("-" * 50) - else: - print("所有文件语法正确,无需关注") - - print("=== 处理结果 ===") - print(f"总文件数: {result.total_files}") - print(f"成功处理文件数: {result.processed_files}") - - if result.pr_analysis: - print("\n=== PR整体分析 ===") - pr = result.pr_analysis - print(f"涉及文本变更: {pr.has_text_changes}") - print(f"文本变更类型: {pr.text_change_type}") - print(f"存在语法错误: {pr.has_grammar_errors}") - print(f"语法错误列表: {pr.grammar_errors}") - print(f"详细分析: {pr.detailed_analysis}") - print(f"改进建议: {pr.suggestions}") - - - return result - -if __name__ == "__main__": - # 微服务接口逻辑: 传递进来的就是 sample_diff 的内容 - sample_diff = sys.argv[1] - result = get_comment_analysis(sample_diff) - print(result) diff --git a/ci/tools/comment/create_comment.py b/ci/tools/comment/create_comment.py deleted file mode 100644 index 481c7a2f5..000000000 --- a/ci/tools/comment/create_comment.py +++ /dev/null @@ -1,372 +0,0 @@ -import argparse -import json -import logging -import re -import sys -from dataclasses import dataclass, field -from difflib import SequenceMatcher -from typing import TypeVar, Generic -from comment_agent import get_comment_analysis - -import requests -import yaml - -logging.basicConfig(level=logging.INFO, stream=sys.stdout, - format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') -logger = logging.getLogger(__name__) - - -@dataclass -class Org: - org_name: str - comment_target_owner: str - comment_target_repo: str - auto_comment_enabled: bool = field(default=True) - confidence_threshold: float = field(default=0.7) - text_check_enabled: bool = field(default=True) - grammar_check_enabled: bool = field(default=True) - - -@dataclass -class CommentAgentConfig: - backend: dict = field(default_factory=dict) - model: dict = field(default_factory=dict) - processing: dict = field(default_factory=dict) - logging: dict = field(default_factory=dict) - - -@dataclass -class Config: - orgs: list[dict | Org] - comment_agent: dict | CommentAgentConfig = field(default_factory=dict) - - def __post_init__(self): - tmp_orgs: list[Org] = [] - for item in self.orgs: - tmp_orgs.append(Org(**item)) - self.orgs = tmp_orgs - - if isinstance(self.comment_agent, dict) and self.comment_agent: - self.comment_agent = CommentAgentConfig(**self.comment_agent) - - -@dataclass -class ReqArgs: - method: str - url: str - headers: dict[str, str] - params: dict[str, str] | None = field(default=None) - data: str | None = field(default=None) - timeout: int = field(default=180) - - -T = TypeVar('T') -content_type_is_text = "text/plain" -content_type_is_json_dict = {} -content_type_is_json_list = [] - - -def send_request(args: ReqArgs, t: Generic[T]) -> T: - error_count = 0 - while error_count < 3: - try: - resp = requests.request(**args.__dict__) - resp.raise_for_status() 
- if type(t) is dict or type(t) is list: - res_data: dict | list = resp.json() - else: - res_data: str = resp.text - except requests.exceptions.RequestException as e: - if e.response.status_code in [400, 401, 403, 404, 405]: - logger.error("[ERROR] client error {}".format(e)) - break - logger.error("[ERROR] server error: {}".format(e)) - error_count += 1 - else: - logger.info("[OK] [{}], {}".format(args.method, args.url)) - return res_data - return None - - -class GiteeClient: - """ - Gitee OpenAPI 客户端 - """ - headers = { - "Content-Type": "application/json", - "Accept": "application/json", - } - - def __init__(self, developer_token: str): - """ - 构造函数 - :param developer_token: Gitee v5 token - """ - self.headers["Authorization"] = "Bearer {}".format(developer_token) - - def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: - req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) - req_args = ReqArgs(method="GET", url=req_url, headers=self.headers) - result: str | None = send_request(req_args, "") - if result is None: - logger.error("can not get diff file from PR: {}".format(req_url)) - return result - - def add_pr_comment(self, owner, repo, number, body): - req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) - req_body = { - "body": "### 🤖 AI审查反馈 \n {} ".format(body) - } - req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) - result: dict | None = send_request(req_args, {}) - return result is not None - - - -def get_diff_file_list(diff_content: str) -> list[str]: - diff_files_list = [] - diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]] - for diff_file in diff_files: - if diff_file.endswith('\"'): - d = re.compile(r'/[\d\s\S]+') - diff_file = d.findall(diff_file) - diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') - diff_files_list.append(diff_file) - else: - diff_files_list.append(diff_file) - return diff_files_list - - -def generate_comment_content(comment_result, pr_url: str, analysis_status: str = "success") -> str: - """根据分析结果生成评论内容""" - comment_body = "" - - # 根据分析状态添加不同的状态标识 - if analysis_status == "error": - comment_body += "### 分析状态:处理失败\n" - comment_body += "**分析过程中发生错误,无法生成详细反馈。请手动审查文本变更。**\n\n" - elif analysis_status == "low_confidence": - comment_body += "### 分析状态:置信度较低\n" - comment_body += "**当前分析置信度较低,结果仅供参考。建议进行人工审查。**\n\n" - elif analysis_status == "no_text_changes": - comment_body += "### 分析状态:无文本问题\n" - comment_body += "**AI分析结果显示本次PR未发现明显的文本变更或语法问题。无需改动。**\n\n" - elif analysis_status == "no_grammar_errors": - comment_body += "### 分析状态:文本质量良好\n" - comment_body += "**检测到文本变更,但未发现明显的语法错误,文本质量良好。无需改动。**\n\n" - else: # success with issues - comment_body += "### 分析状态:发现需要关注的问题\n" - comment_body += "**AI分析发现了一些文本变更或语法问题,请查看下方详细信息。**\n\n" - - # 如果有分析结果,添加详细信息 - if comment_result and not comment_result.error: - # 如果有PR整体分析 - if comment_result.pr_analysis: - pr_analysis = comment_result.pr_analysis - - # 添加整体评估摘要 - comment_body += "## 整体评估\n" - comment_body += f"- 涉及文本变更: {'是' if pr_analysis.has_text_changes else '否'}\n" - comment_body += f"- 文本变更类型: {pr_analysis.text_change_type}\n" - comment_body += f"- 存在语法错误: {'是' if pr_analysis.has_grammar_errors else '否'}\n\n" - - # 添加详细分析 - if pr_analysis.detailed_analysis: - comment_body += "## 详细分析\n" - comment_body += f"{pr_analysis.detailed_analysis}\n\n" - - # 添加语法错误列表 - if pr_analysis.grammar_errors: - comment_body += "## 语法问题\n" - for i, error in 
enumerate(pr_analysis.grammar_errors, 1): - comment_body += f"{i}. {error}\n" - comment_body += "\n" - - # 添加改进建议 - if pr_analysis.suggestions: - comment_body += "## 改进建议\n" - for i, suggestion in enumerate(pr_analysis.suggestions, 1): - comment_body += f"{i}. {suggestion}\n" - comment_body += "\n" - - # 添加文件级别的分析结果 - if comment_result.file_analyses: - # comment_body += "## 文件分析\n" - - # 统计有问题的文件 - files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues] - files_without_issues = [f for f in comment_result.file_analyses if not f.has_text_changes and not f.grammar_issues] - - if files_with_issues: - comment_body += f"### 需要关注的文件 ({len(files_with_issues)} 个)\n" - for i, file_analysis in enumerate(files_with_issues, 1): - comment_body += f"\n**{i}. {file_analysis.file_path}**\n" - - if file_analysis.has_text_changes: - comment_body += f"- 文本变更: 检测到英文文本改动\n" - if file_analysis.text_lines: - comment_body += f"- 涉及行数: {len(file_analysis.text_lines)} 行\n" - - if file_analysis.grammar_issues: - comment_body += f"- 语法问题: 发现 {len(file_analysis.grammar_issues)} 个问题\n" - for j, issue in enumerate(file_analysis.grammar_issues, 1): - comment_body += f" {j}. {issue}\n" - - if file_analysis.analysis_details: - comment_body += f"- 分析详情: {file_analysis.analysis_details}\n" - - if files_without_issues: - comment_body += f"\n### 无问题的文件 ({len(files_without_issues)} 个)\n" - for file_analysis in files_without_issues: - comment_body += f"- {file_analysis.file_path}\n" - - # 添加处理统计 - # comment_body += f"\n### 处理统计\n" - # comment_body += f"- 总文件数: {comment_result.total_files}\n" - # comment_body += f"- 成功分析: {comment_result.processed_files}\n" - # comment_body += f"- 有文本变更: {len([f for f in comment_result.file_analyses if f.has_text_changes])}\n" - # comment_body += f"- 有语法问题: {len([f for f in comment_result.file_analyses if f.grammar_issues])}\n" - - # 添加免责声明 - comment_body += "## 免责声明\n" - comment_body += "本评论内容基于AI Agent技术自动生成,仅供参考。请开发者根据实际情况进行判断和修改。\n" - - return comment_body - - -class Args: - gitee_token: str - pr_owner: str - pr_repo: str - pr_number: int - siliconflow_api_key: str = "" - siliconflow_api_base: str = "https://api.siliconflow.cn/v1" - - def validate(self): - valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number - if not valid: - logger.error("Invalid Command Arguments") - sys.exit(1) - - -def load_config_yaml(yaml_path): - with open(yaml_path, "r", encoding="utf-8") as config_in: - data = yaml.safe_load(config_in) - - if data is None: - return None - return Config(**data) - - -def create_comment_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, - pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str): - pr_html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) - - for org_item in conf.orgs: - if org_item.org_name != pr_owner: - continue - - if not org_item.auto_comment_enabled: - logger.info(f"组织 {org_item.org_name} 未启用自动评论功能") - continue - - # 移除文件筛选逻辑,对所有PR平等处理 - logger.info("开始对PR进行全面文本分析(不限制文件类型和路径)") - - # 获取diff内容 - diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) - if diff_content is None: - logger.error("无法获取PR的diff内容") - sys.exit(1) - - # 调用AI Agent进行分析 - logger.info("开始进行AI代码审查分析...") - comment_result = get_comment_analysis(diff_content, siliconflow_api_key, siliconflow_api_base) - - if not comment_result: - logger.error("AI分析失败,将发布错误状态评论") - # 创建一个错误结果对象,确保能发布评论 - from comment_agent import CommentResult - 
comment_result = CommentResult( - pr_analysis=None, - file_analyses=[], - processed_files=0, - total_files=0, - error="AI分析过程失败" - ) - - # 确定分析状态和评论内容 - analysis_status = "success" - - if comment_result.error: - analysis_status = "error" - logger.info("AI分析过程出错,将发布错误状态评论") - elif comment_result.pr_analysis: - pr_analysis = comment_result.pr_analysis - - # 检查是否有文本变更或语法错误 - if pr_analysis.has_text_changes and pr_analysis.has_grammar_errors: - analysis_status = "success" # 有问题,正常处理 - logger.info("检测到文本变更和语法错误,将发布问题报告评论") - elif pr_analysis.has_text_changes and not pr_analysis.has_grammar_errors: - analysis_status = "no_grammar_errors" - logger.info("检测到文本变更但无语法错误,将发布文本质量良好的评论") - elif not pr_analysis.has_text_changes: - analysis_status = "no_text_changes" - logger.info("未检测到文本变更,将发布无文本问题的评论") - else: - analysis_status = "success" - logger.info("检测到需要关注的问题,将发布详细分析评论") - else: - # 如果没有整体分析,检查是否有文件级别的问题 - files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues] - if files_with_issues: - analysis_status = "success" - logger.info(f"检测到 {len(files_with_issues)} 个文件有文本问题,将发布文件级别问题评论") - else: - analysis_status = "no_text_changes" - logger.info("未检测到文件级别问题,将发布无问题评论") - - # 总是生成和发布评论 - comment_content = generate_comment_content( - comment_result, - pr_html_url, - analysis_status - ) - - # 发布评论 - success = cli.add_pr_comment(pr_owner, pr_repo, pr_number, comment_content) - if success: - logger.info(f"AI代码审查评论发布成功 - 状态: {analysis_status}") - else: - logger.error(f"AI代码审查评论发布失败 - 状态: {analysis_status}") - - -def main(): - parser = argparse.ArgumentParser(description='Create AI-powered PR comment based on text analysis') - parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') - parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') - parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') - parser.add_argument('--pr_number', type=str, required=True, help='the PR number') - parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') - parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') - args = Args() - parser.parse_args(args=sys.argv[1:], namespace=args) - args.validate() - - exec_py = sys.argv[0] - config_yaml_path = exec_py[:-2] + 'yaml' - conf = load_config_yaml(config_yaml_path) - - cli = GiteeClient(args.gitee_token) - - pr_owner = args.pr_owner - pr_repo = args.pr_repo - pr_number = args.pr_number - siliconflow_api_key = args.siliconflow_api_key - siliconflow_api_base = args.siliconflow_api_base - create_comment_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) - - -if __name__ == '__main__': - main() diff --git a/ci/tools/comment/create_comment.yaml b/ci/tools/comment/create_comment.yaml deleted file mode 100644 index 32ac269ab..000000000 --- a/ci/tools/comment/create_comment.yaml +++ /dev/null @@ -1,38 +0,0 @@ -# Comment Agent Configuration -comment_agent: - # Backend Configuration - backend: - type: "siliconflow" # Options: "ollama" or "siliconflow" - # siliconflow配置现在通过命令行参数传入 - - # Model Configuration - model: - name: "Qwen/Qwen3-8B" - temperature: 0.1 - max_retry: 5 # For siliconflow backend - - # Processing Configuration - processing: - max_workers: 8 # Number of parallel workers for file processing - single_file_timeout: 180 # Timeout for single file analysis (seconds) - 
total_comment_timeout: 300 # Timeout for total PR analysis (seconds) - - # Logging Configuration - logging: - level: "INFO" - -# PR Comment Configuration -orgs: - - org_name: openeuler - comment_target_owner: openeuler - comment_target_repo: docs - auto_comment_enabled: true - text_check_enabled: true # 是否启用文本变更检测 - grammar_check_enabled: true # 是否启用语法错误检测 - - - org_name: src-openeuler - comment_target_owner: openeuler - comment_target_repo: globalization - auto_comment_enabled: true - text_check_enabled: true - grammar_check_enabled: true \ No newline at end of file -- Gitee From e609ffd74583e77b1920856f2d88aed58774cf07 Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Mon, 22 Sep 2025 14:50:45 +0800 Subject: [PATCH 4/8] update for merge --- .../new_create_translation_issue.py | 742 ++++++++---------- .../new_create_translation_issue.yaml | 69 +- .../new_create_translation_issue_AI.py | 402 ++++++++++ .../new_create_translation_issue_AI.yaml | 49 ++ 4 files changed, 812 insertions(+), 450 deletions(-) mode change 100755 => 100644 ci/tools/translation/new_create_translation_issue.py mode change 100755 => 100644 ci/tools/translation/new_create_translation_issue.yaml create mode 100755 ci/tools/translation/new_create_translation_issue_AI.py create mode 100755 ci/tools/translation/new_create_translation_issue_AI.yaml diff --git a/ci/tools/translation/new_create_translation_issue.py b/ci/tools/translation/new_create_translation_issue.py old mode 100755 new mode 100644 index 0a61c7136..f5e3f8f8a --- a/ci/tools/translation/new_create_translation_issue.py +++ b/ci/tools/translation/new_create_translation_issue.py @@ -1,402 +1,340 @@ -import argparse -import json -import logging -import re -import sys -from dataclasses import dataclass, field -from difflib import SequenceMatcher -from typing import TypeVar, Generic -from translation_agent import get_agent_summary - -import requests -import yaml - -logging.basicConfig(level=logging.INFO, stream=sys.stdout, - format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') -logger = logging.getLogger(__name__) - - -@dataclass -class IssueTrigger: - trigger_pr_path: str - issue_title: str - issue_assignee: str - file_extension: list[str] = field(default_factory=list) - - -@dataclass -class Org: - org_name: str - issue_of_owner: str - issue_of_repo: str - auto_create_issue: bool - issue_triggers: list[dict | IssueTrigger] = field(default_factory=list) - change_content_exclude: list[str] = field(default_factory=list) - - def __post_init__(self): - tmp_issue_triggers: list[IssueTrigger] = [] - for item in self.issue_triggers: - tmp_issue_triggers.append(IssueTrigger(**item)) - self.issue_triggers = tmp_issue_triggers - - -@dataclass -class TranslationAgentConfig: - backend: dict = field(default_factory=dict) - model: dict = field(default_factory=dict) - processing: dict = field(default_factory=dict) - logging: dict = field(default_factory=dict) - - -@dataclass -class Config: - orgs: list[dict | Org] - translation_agent: dict | TranslationAgentConfig = field(default_factory=dict) - - def __post_init__(self): - tmp_orgs: list[Org] = [] - for item in self.orgs: - tmp_orgs.append(Org(**item)) - self.orgs = tmp_orgs - - if isinstance(self.translation_agent, dict) and self.translation_agent: - self.translation_agent = TranslationAgentConfig(**self.translation_agent) - - -@dataclass -class ReqArgs: - method: str - url: str - headers: dict[str, str] - params: dict[str, str] | None = field(default=None) - data: str | None = 
field(default=None) - timeout: int = field(default=180) - - -T = TypeVar('T') -content_type_is_text = "text/plain" -content_type_is_json_dict = {} -content_type_is_json_list = [] - - -def send_request(args: ReqArgs, t: Generic[T]) -> T: - error_count = 0 - while error_count < 3: - try: - resp = requests.request(**args.__dict__) - resp.raise_for_status() - if type(t) is dict or type(t) is list: - res_data: dict | list = resp.json() - else: - res_data: str = resp.text - except requests.exceptions.RequestException as e: - if e.response.status_code in [400, 401, 403, 404, 405]: - logger.error("[ERROR] client error {}".format(e)) - break - logger.error("[ERROR] server error: {}".format(e)) - error_count += 1 - else: - logger.info("[OK] [{}], {}".format(args.method, args.url)) - return res_data - return None - - -class GiteeClient: - """ - Gitee OpenAPI 客户端 - """ - headers = { - "Content-Type": "application/json", - "Accept": "application/json", - } - - def __init__(self, developer_token: str): - """ - 构造函数 - :param developer_token: Gitee v5 token - """ - self.headers["Authorization"] = "Bearer {}".format(developer_token) - - def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: - req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) - req_args = ReqArgs(method="GET", url=req_url, headers=self.headers) - result: str | None = send_request(req_args, "") - if result is None: - logger.error("can not get diff file from PR: {}".format(req_url)) - return result - - def check_issue_exists(self, owner: str, repo: str, issue_titles: list[str]) -> tuple[list[str], list[str]]: - req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) - page = 1 - existed_issues = [] - while page <= 200: - query = { - "per_page": 100, - "page": page, - "sort": "created", - "direction": "desc", - } - req_args = ReqArgs(method="GET", url=req_url, params=query, headers=self.headers) - result: list | None = send_request(req_args, []) - if result is None: - break - page += 1 - for item in result: - if not issue_titles: - return [], existed_issues - if issue_titles and item.get('title') in issue_titles: - issue_titles.remove(item.get('title')) - existed_issues.append(item.get('html_url')) - if len(result) < 100: - break - return issue_titles, existed_issues - - def create_issue(self, owner, repo, title, assignee, body): - req_url = "https://gitee.com/api/v5/repos/{}/issues".format(owner) - req_body = { - "repo": repo, - "title": title, - "issue_type": "翻译", - "body": body, - "assignee": assignee, - "push_events": False, - "tag_push_events": False, - "issues_events": False, - } - req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) - result: dict | None = send_request(req_args, {}) - return result is None - - def add_pr_comment(self, owner, repo, number, body): - req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) - req_body = { - "body": "### Translation Feedback \n {} ".format(body) - } - req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) - result: dict | None = send_request(req_args, {}) - return result is None - - def check_only_marks_changed(self, owner, repo, number, check_list): - diff_content = self.get_diff_content(owner, repo, number) - deleted_strs, inserted_strs = get_diff_content_list(diff_content) - if is_only_marks_changed(deleted_strs, inserted_strs, check_list): - logger.warning('Only marks changed, skip the following 
steps') - sys.exit(1) - logger.info('Not just only marks changed, continue creating issue') - - -def get_diff_file_list(diff_content: str) -> list[str]: - diff_files_list = [] - diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]] - for diff_file in diff_files: - if diff_file.endswith('\"'): - d = re.compile(r'/[\d\s\S]+') - diff_file = d.findall(diff_file) - diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') - diff_files_list.append(diff_file) - else: - diff_files_list.append(diff_file) - return diff_files_list - - -def get_diff_content_list(diff_content: str) -> tuple[str, str]: - pieces = diff_content.split('diff --git') - deleted_strs = '' - inserted_strs = '' - for piece in pieces: - start = False - for line in piece.splitlines(): - if line.startswith('@@'): - start = True - continue - if not start: - continue - if line.startswith('-'): - if len(line) == 1: - deleted_strs += '\n' - else: - deleted_strs += line[1:] - elif line.startswith('+'): - if len(line) == 1: - inserted_strs += '\n' - else: - inserted_strs += line[1:] - return deleted_strs, inserted_strs - - -def is_only_marks_changed(a, b, check_list): - s = SequenceMatcher(None, a, b) - for tag, i1, i2, j1, j2 in s.get_opcodes(): - if tag == 'equal': - continue - elif tag in ['delete', 'insert']: - return False - elif tag == 'replace': - deleted = ''.join(a[i1:i2]).strip() - inserted = ''.join(b[j1:j2]).strip() - if deleted not in check_list or inserted not in check_list: - return False - return True - - -class Args: - gitee_token: str - pr_owner: str - pr_repo: str - pr_number: int - siliconflow_api_key: str = "" - siliconflow_api_base: str = "https://api.siliconflow.cn/v1" - - def validate(self): - valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number - if not valid: - logger.error("Invalid Command Arguments") - sys.exit(1) - - -def load_config_yaml(yaml_path): - with open(yaml_path, "r", encoding="utf-8") as config_in: - data = yaml.safe_load(config_in) - - if data is None: - return None - return Config(**data) - - -def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, - pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str): - pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) - for org_item in conf.orgs: - issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) - if org_item.org_name != pr_owner: - continue - # 旧标点符号判断逻辑,已弃用 - # if org_item.auto_create_issue: - # cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude) - file_count = 0 - diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) - if diff_content is None: - sys.exit(1) - diff_files = get_diff_file_list(diff_content) - zh_file = [] - en_file = [] - need_create_issue = {} - for trigger in org_item.issue_triggers: - for diff_file in diff_files: - if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: - logger.info("file {} has been changed".format(diff_file)) - file_count += 1 - if "/zh" in trigger.trigger_pr_path: - need_create_issue["zh"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] - zh_file.append(diff_file.replace("zh/", "")) - elif "/en" in trigger.trigger_pr_path: - need_create_issue["en"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] - en_file.append(diff_file.replace("en/", "")) - else: - 
logger.warning("not a range") - changed_same_files = False - for z in zh_file: - if z in en_file: - changed_same_files = True - else: - changed_same_files = False - if file_count == 0: - logger.warning( - "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) - return - if changed_same_files: - logger.info("changed the same files in en and zh path, no need to create issue") - return - - need_create_issue_template = {} - need_create_issue_titles = [] - for issue_item in need_create_issue: - need_create_issue_titles.append(need_create_issue[issue_item][1]) - need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] - if need_create_issue_titles: - - need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, - org_item.issue_of_repo, - need_create_issue_titles) - - if not need_create_issue_list: - feedback_comment = "issue has already created, please go to check issue: {}".format( - existed_issue_list) - logger.info("Warning: " + feedback_comment) - cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) - for need_create_issue_item in need_create_issue_list: - - issue_summary = get_agent_summary(diff_content, siliconflow_api_key, siliconflow_api_base) - issue_body = "" - if issue_summary and not issue_summary.error: - issue_body += f"## 📊 变更统计\n\n" - issue_body += f"- **总文件数**: {issue_summary.total_files}\n" - issue_body += f"- **成功处理文件数**: {issue_summary.processed_files}\n" - if issue_summary.total_files != issue_summary.processed_files: - # 注意人工审查提醒 - issue_body += f"- **未处理文件数**: {issue_summary.total_files - issue_summary.processed_files}\n" - issue_body += f"- **提醒:机器人未能及时自动生成所有改动的摘要,请注意人工审查!**\n" - if issue_summary.total_summary: - total = issue_summary.total_summary - issue_body += f"- **总改动行数**: {total.total_lines_changed}\n" - issue_body += f"- **改动类型**: {', '.join(total.change_type_list)}\n\n" - issue_body += f"## 🔍 整体变更摘要\n\n" - issue_body += f"{total.overall_summary}\n\n" - issue_body += f"## ⚠️ 整体潜在影响\n\n" - issue_body += f"{total.overall_potential_impact}\n\n" - if issue_summary.file_summaries: - issue_body += f"## 📝 单文件变更详情\n\n" - for summary in issue_summary.file_summaries: - issue_body += f"### 📁 {summary.file_path}\n\n" - issue_body += f"- **改动类型**: {summary.change_type}\n" - issue_body += f"- **新增行数**: {summary.lines_added}\n" - issue_body += f"- **删除行数**: {summary.lines_deleted}\n" - issue_body += f"- **潜在影响**: {summary.potential_impact}\n" - issue_body += f"- **详细摘要**: {summary.summary}\n\n" - issue_body += "---\n\n" - else: - issue_body += f"## ⚠️ 翻译变更检测\n\n" - issue_body += f"检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n" - issue_body += f"**变更文件数量**: {len(diff_files)}\n" - issue_body += f"**相关PR**: {pr__html_url}\n\n" - - issue_body += f"## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,仅供参考,请以实际更改为准。\n\n" - issue_body += f"## 🔗 相关PR链接\n\n" - issue_body += f"- {pr__html_url}\n" - - cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, - need_create_issue_template[need_create_issue_item], - issue_body) - - - -def main(): - parser = argparse.ArgumentParser(description='Create Gitee Webhook based on the profile') - parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') - parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') - parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') - parser.add_argument('--pr_number', type=str, required=True, help='the PR 
number') - parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') - parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') - args = Args() - parser.parse_args(args=sys.argv[1:], namespace=args) - args.validate() - - exec_py = sys.argv[0] - config_yaml_path = exec_py[:-2] + 'yaml' - conf = load_config_yaml(config_yaml_path) - - cli = GiteeClient(args.gitee_token) - - pr_owner = args.pr_owner - pr_repo = args.pr_repo - pr_number = args.pr_number - siliconflow_api_key = args.siliconflow_api_key - siliconflow_api_base = args.siliconflow_api_base - create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) - - -if __name__ == '__main__': - main() +import argparse +import json +import logging +import re +import sys +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import TypeVar, Generic + +import requests +import yaml + +logging.basicConfig(level=logging.INFO, stream=sys.stdout, + format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class IssueTrigger: + trigger_pr_path: str + issue_title: str + issue_assignee: str + file_extension: list[str] = field(default_factory=list) + + +@dataclass +class Org: + org_name: str + issue_of_owner: str + issue_of_repo: str + auto_create_issue: bool + issue_triggers: list[dict | IssueTrigger] = field(default_factory=list) + change_content_exclude: list[str] = field(default_factory=list) + + def __post_init__(self): + tmp_issue_triggers: list[IssueTrigger] = [] + for item in self.issue_triggers: + tmp_issue_triggers.append(IssueTrigger(**item)) + self.issue_triggers = tmp_issue_triggers + + +@dataclass +class Config: + orgs: list[dict | Org] + + def __post_init__(self): + tmp_orgs: list[Org] = [] + for item in self.orgs: + tmp_orgs.append(Org(**item)) + self.orgs = tmp_orgs + + +@dataclass +class ReqArgs: + method: str + url: str + headers: dict[str, str] + params: dict[str, str] | None = field(default=None) + data: str | None = field(default=None) + timeout: int = field(default=180) + + +T = TypeVar('T') +content_type_is_text = "text/plain" +content_type_is_json_dict = {} +content_type_is_json_list = [] + + +def send_request(args: ReqArgs, t: Generic[T]) -> T: + error_count = 0 + while error_count < 3: + try: + resp = requests.request(**args.__dict__) + resp.raise_for_status() + if type(t) is dict or type(t) is list: + res_data: dict | list = resp.json() + else: + res_data: str = resp.text + except requests.exceptions.RequestException as e: + if e.response.status_code in [400, 401, 403, 404, 405]: + logger.error("[ERROR] client error {}".format(e)) + break + logger.error("[ERROR] server error: {}".format(e)) + error_count += 1 + else: + logger.info("[OK] [{}], {}".format(args.method, args.url)) + return res_data + return None + + +class GiteeClient: + """ + Gitee OpenAPI 客户端 + """ + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + } + + def __init__(self, developer_token: str): + """ + 构造函数 + :param developer_token: Gitee v5 token + """ + self.headers["Authorization"] = "Bearer {}".format(developer_token) + + def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: + req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) + req_args = ReqArgs(method="GET", url=req_url, 
headers=self.headers) + result: str | None = send_request(req_args, "") + if result is None: + logger.error("can not get diff file from PR: {}".format(req_url)) + return result + + def check_issue_exists(self, owner: str, repo: str, issue_titles: list[str]) -> tuple[list[str], list[str]]: + req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) + page = 1 + existed_issues = [] + while page <= 200: + query = { + "per_page": 100, + "page": page, + "sort": "created", + "direction": "desc", + } + req_args = ReqArgs(method="GET", url=req_url, params=query, headers=self.headers) + result: list | None = send_request(req_args, []) + if result is None: + break + page += 1 + for item in result: + if not issue_titles: + return [], existed_issues + if issue_titles and item.get('title') in issue_titles: + issue_titles.remove(item.get('title')) + existed_issues.append(item.get('html_url')) + if len(result) < 100: + break + return issue_titles, existed_issues + + def create_issue(self, owner, repo, title, assignee, body): + req_url = "https://gitee.com/api/v5/repos/{}/issues".format(owner) + req_body = { + "repo": repo, + "title": title, + "issue_type": "翻译", + "body": body, + "assignee": assignee, + "push_events": False, + "tag_push_events": False, + "issues_events": False, + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def add_pr_comment(self, owner, repo, number, body): + req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) + req_body = { + "body": "### Translation Feedback \n {} ".format(body) + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def check_only_marks_changed(self, owner, repo, number, check_list): + diff_content = self.get_diff_content(owner, repo, number) + deleted_strs, inserted_strs = get_diff_content_list(diff_content) + if is_only_marks_changed(deleted_strs, inserted_strs, check_list): + logger.warning('Only marks changed, skip the following steps') + sys.exit(1) + logger.info('Not just only marks changed, continue creating issue') + + +def get_diff_file_list(diff_content: str) -> list[str]: + diff_files_list = [] + diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]] + for diff_file in diff_files: + if diff_file.endswith('\"'): + d = re.compile(r'/[\d\s\S]+') + diff_file = d.findall(diff_file) + diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') + diff_files_list.append(diff_file) + else: + diff_files_list.append(diff_file) + return diff_files_list + + +def get_diff_content_list(diff_content: str) -> tuple[str, str]: + pieces = diff_content.split('diff --git') + deleted_strs = '' + inserted_strs = '' + for piece in pieces: + start = False + for line in piece.splitlines(): + if line.startswith('@@'): + start = True + continue + if not start: + continue + if line.startswith('-'): + if len(line) == 1: + deleted_strs += '\n' + else: + deleted_strs += line[1:] + elif line.startswith('+'): + if len(line) == 1: + inserted_strs += '\n' + else: + inserted_strs += line[1:] + return deleted_strs, inserted_strs + + +def is_only_marks_changed(a, b, check_list): + s = SequenceMatcher(None, a, b) + for tag, i1, i2, j1, j2 in s.get_opcodes(): + if tag == 'equal': + continue + elif tag in ['delete', 'insert']: + return False + elif tag 
== 'replace': + deleted = ''.join(a[i1:i2]).strip() + inserted = ''.join(b[j1:j2]).strip() + if deleted not in check_list or inserted not in check_list: + return False + return True + + +class Args: + gitee_token: str + pr_owner: str + pr_repo: str + pr_number: int + + def validate(self): + valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number + if not valid: + logger.error("Invalid Command Arguments") + sys.exit(1) + + +def load_config_yaml(yaml_path): + with open(yaml_path, "r", encoding="utf-8") as config_in: + data = yaml.safe_load(config_in) + + if data is None: + return None + return Config(**data) + + +def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, + pr_number: int): + pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + for org_item in conf.orgs: + issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + if org_item.org_name != pr_owner: + continue + if org_item.auto_create_issue: + cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude) + file_count = 0 + diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) + if diff_content is None: + sys.exit(1) + diff_files = get_diff_file_list(diff_content) + zh_file = [] + en_file = [] + need_create_issue = {} + for trigger in org_item.issue_triggers: + for diff_file in diff_files: + if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: + logger.info("file {} has been changed".format(diff_file)) + file_count += 1 + if "/zh" in trigger.trigger_pr_path: + need_create_issue["zh"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + zh_file.append(diff_file.replace("zh/", "")) + elif "/en" in trigger.trigger_pr_path: + need_create_issue["en"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + en_file.append(diff_file.replace("en/", "")) + else: + logger.warning("not a range") + changed_same_files = False + for z in zh_file: + if z in en_file: + changed_same_files = True + else: + changed_same_files = False + if file_count == 0: + logger.warning( + "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) + return + if changed_same_files: + logger.info("changed the same files in en and zh path, no need to create issue") + return + + need_create_issue_template = {} + need_create_issue_titles = [] + for issue_item in need_create_issue: + need_create_issue_titles.append(need_create_issue[issue_item][1]) + need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] + if need_create_issue_titles: + need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, + org_item.issue_of_repo, + need_create_issue_titles) + if not need_create_issue_list: + feedback_comment = "issue has already created, please go to check issue: {}".format( + existed_issue_list) + logger.info("Warning: " + feedback_comment) + cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) + for need_create_issue_item in need_create_issue_list: + cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, + need_create_issue_template[need_create_issue_item], + "### Related PR link \n - {}".format(pr__html_url)) + + +def main(): + parser = argparse.ArgumentParser(description='Create Gitee Webhook based on the profile') + 
parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') + parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') + parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') + parser.add_argument('--pr_number', type=str, required=True, help='the PR number') + args = Args() + parser.parse_args(args=sys.argv[1:], namespace=args) + args.validate() + + exec_py = sys.argv[0] + config_yaml_path = exec_py[:-2] + 'yaml' + conf = load_config_yaml(config_yaml_path) + + cli = GiteeClient(args.gitee_token) + + pr_owner = args.pr_owner + pr_repo = args.pr_repo + pr_number = args.pr_number + create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number) + + +if __name__ == '__main__': + main() diff --git a/ci/tools/translation/new_create_translation_issue.yaml b/ci/tools/translation/new_create_translation_issue.yaml old mode 100755 new mode 100644 index bc48ab7a2..07e6c76d0 --- a/ci/tools/translation/new_create_translation_issue.yaml +++ b/ci/tools/translation/new_create_translation_issue.yaml @@ -1,49 +1,22 @@ -# Translation Agent Configuration -translation_agent: - # Backend Configuration - backend: - type: "siliconflow" # Options: "ollama" or "siliconflow" - # siliconflow配置现在通过命令行参数传入 - ollama: - base_url: "http://localhost:11434" - - # Model Configuration - model: - name: "Qwen/Qwen3-32B" # Options: "llama3" "Qwen/Qwen3-8B" "THUDM/GLM-4-32B-0414" or others - temperature: 0.1 - max_retry: 5 # For siliconflow backend - max_retry_ollama: 1 # For ollama backend - - # Processing Configuration - processing: - max_workers: 8 # Number of parallel workers for file processing - single_file_timeout: 180 # Timeout for single file summary generation (seconds) - total_summary_timeout: 300 # Timeout for total summary generation (seconds) - - # Logging Configuration - logging: - level: "INFO" - -# Issue Creation Configuration -orgs: - - org_name: openeuler - issue_of_owner: openeuler - issue_of_repo: globalization - auto_create_issue: true - issue_triggers: - - trigger_pr_path: 'docs/zh' - issue_title: "[Auto] This is an English translation issue for the PR" - issue_assignee: judithsq - file_extension: [ doc, md, json ] - change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] - - - org_name: src-openeuler - issue_of_owner: openeuler - issue_of_repo: globalization - auto_create_issue: true - issue_triggers: - - trigger_pr_path: 'docs/zh' - issue_title: "[Auto] This is an English translation issue for the PR" - issue_assignee: judithsq - file_extension: [ doc, md, json ] +orgs: + - org_name: openeuler + issue_of_owner: openeuler + issue_of_repo: globalization + auto_create_issue: true + issue_triggers: + - trigger_pr_path: 'docs/zh' + issue_title: "[Auto] This is an English translation issue for the PR" + issue_assignee: judithsq + file_extension: [ doc, md, json ] + change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] + + - org_name: src-openeuler + issue_of_owner: openeuler + issue_of_repo: globalization + auto_create_issue: true + issue_triggers: + - trigger_pr_path: 'docs/zh' + issue_title: "[Auto] This is an English translation issue for the PR" + issue_assignee: judithsq + file_extension: [ doc, md, json ] change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] \ No newline at end of file diff --git a/ci/tools/translation/new_create_translation_issue_AI.py b/ci/tools/translation/new_create_translation_issue_AI.py 
new file mode 100755 index 000000000..0a61c7136 --- /dev/null +++ b/ci/tools/translation/new_create_translation_issue_AI.py @@ -0,0 +1,402 @@ +import argparse +import json +import logging +import re +import sys +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import TypeVar, Generic +from translation_agent import get_agent_summary + +import requests +import yaml + +logging.basicConfig(level=logging.INFO, stream=sys.stdout, + format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class IssueTrigger: + trigger_pr_path: str + issue_title: str + issue_assignee: str + file_extension: list[str] = field(default_factory=list) + + +@dataclass +class Org: + org_name: str + issue_of_owner: str + issue_of_repo: str + auto_create_issue: bool + issue_triggers: list[dict | IssueTrigger] = field(default_factory=list) + change_content_exclude: list[str] = field(default_factory=list) + + def __post_init__(self): + tmp_issue_triggers: list[IssueTrigger] = [] + for item in self.issue_triggers: + tmp_issue_triggers.append(IssueTrigger(**item)) + self.issue_triggers = tmp_issue_triggers + + +@dataclass +class TranslationAgentConfig: + backend: dict = field(default_factory=dict) + model: dict = field(default_factory=dict) + processing: dict = field(default_factory=dict) + logging: dict = field(default_factory=dict) + + +@dataclass +class Config: + orgs: list[dict | Org] + translation_agent: dict | TranslationAgentConfig = field(default_factory=dict) + + def __post_init__(self): + tmp_orgs: list[Org] = [] + for item in self.orgs: + tmp_orgs.append(Org(**item)) + self.orgs = tmp_orgs + + if isinstance(self.translation_agent, dict) and self.translation_agent: + self.translation_agent = TranslationAgentConfig(**self.translation_agent) + + +@dataclass +class ReqArgs: + method: str + url: str + headers: dict[str, str] + params: dict[str, str] | None = field(default=None) + data: str | None = field(default=None) + timeout: int = field(default=180) + + +T = TypeVar('T') +content_type_is_text = "text/plain" +content_type_is_json_dict = {} +content_type_is_json_list = [] + + +def send_request(args: ReqArgs, t: Generic[T]) -> T: + error_count = 0 + while error_count < 3: + try: + resp = requests.request(**args.__dict__) + resp.raise_for_status() + if type(t) is dict or type(t) is list: + res_data: dict | list = resp.json() + else: + res_data: str = resp.text + except requests.exceptions.RequestException as e: + if e.response.status_code in [400, 401, 403, 404, 405]: + logger.error("[ERROR] client error {}".format(e)) + break + logger.error("[ERROR] server error: {}".format(e)) + error_count += 1 + else: + logger.info("[OK] [{}], {}".format(args.method, args.url)) + return res_data + return None + + +class GiteeClient: + """ + Gitee OpenAPI 客户端 + """ + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + } + + def __init__(self, developer_token: str): + """ + 构造函数 + :param developer_token: Gitee v5 token + """ + self.headers["Authorization"] = "Bearer {}".format(developer_token) + + def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: + req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) + req_args = ReqArgs(method="GET", url=req_url, headers=self.headers) + result: str | None = send_request(req_args, "") + if result is None: + logger.error("can not get diff file from PR: {}".format(req_url)) + return result + + def 
check_issue_exists(self, owner: str, repo: str, issue_titles: list[str]) -> tuple[list[str], list[str]]: + req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) + page = 1 + existed_issues = [] + while page <= 200: + query = { + "per_page": 100, + "page": page, + "sort": "created", + "direction": "desc", + } + req_args = ReqArgs(method="GET", url=req_url, params=query, headers=self.headers) + result: list | None = send_request(req_args, []) + if result is None: + break + page += 1 + for item in result: + if not issue_titles: + return [], existed_issues + if issue_titles and item.get('title') in issue_titles: + issue_titles.remove(item.get('title')) + existed_issues.append(item.get('html_url')) + if len(result) < 100: + break + return issue_titles, existed_issues + + def create_issue(self, owner, repo, title, assignee, body): + req_url = "https://gitee.com/api/v5/repos/{}/issues".format(owner) + req_body = { + "repo": repo, + "title": title, + "issue_type": "翻译", + "body": body, + "assignee": assignee, + "push_events": False, + "tag_push_events": False, + "issues_events": False, + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def add_pr_comment(self, owner, repo, number, body): + req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) + req_body = { + "body": "### Translation Feedback \n {} ".format(body) + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def check_only_marks_changed(self, owner, repo, number, check_list): + diff_content = self.get_diff_content(owner, repo, number) + deleted_strs, inserted_strs = get_diff_content_list(diff_content) + if is_only_marks_changed(deleted_strs, inserted_strs, check_list): + logger.warning('Only marks changed, skip the following steps') + sys.exit(1) + logger.info('Not just only marks changed, continue creating issue') + + +def get_diff_file_list(diff_content: str) -> list[str]: + diff_files_list = [] + diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]] + for diff_file in diff_files: + if diff_file.endswith('\"'): + d = re.compile(r'/[\d\s\S]+') + diff_file = d.findall(diff_file) + diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') + diff_files_list.append(diff_file) + else: + diff_files_list.append(diff_file) + return diff_files_list + + +def get_diff_content_list(diff_content: str) -> tuple[str, str]: + pieces = diff_content.split('diff --git') + deleted_strs = '' + inserted_strs = '' + for piece in pieces: + start = False + for line in piece.splitlines(): + if line.startswith('@@'): + start = True + continue + if not start: + continue + if line.startswith('-'): + if len(line) == 1: + deleted_strs += '\n' + else: + deleted_strs += line[1:] + elif line.startswith('+'): + if len(line) == 1: + inserted_strs += '\n' + else: + inserted_strs += line[1:] + return deleted_strs, inserted_strs + + +def is_only_marks_changed(a, b, check_list): + s = SequenceMatcher(None, a, b) + for tag, i1, i2, j1, j2 in s.get_opcodes(): + if tag == 'equal': + continue + elif tag in ['delete', 'insert']: + return False + elif tag == 'replace': + deleted = ''.join(a[i1:i2]).strip() + inserted = ''.join(b[j1:j2]).strip() + if deleted not in check_list or inserted not in check_list: + return False + return True + + 
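+# Illustrative usage sketch for is_only_marks_changed (assumption: the sample
+# check_list mirrors the change_content_exclude entries in the yaml config;
+# this demo helper is not called anywhere in the module's runtime flow):
+def _demo_is_only_marks_changed():
+    check_list = [',', ',', '.', '。']
+    # Punctuation-only rewrite: every 'replace' opcode maps a mark in
+    # check_list to another mark in check_list, so the result is True
+    # and the caller would skip issue creation.
+    assert is_only_marks_changed('你好,世界。', '你好,世界.', check_list)
+    # A genuine wording change produces a 'replace' outside check_list.
+    assert not is_only_marks_changed('hello world', 'hello there', check_list)
+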
+class Args: + gitee_token: str + pr_owner: str + pr_repo: str + pr_number: int + siliconflow_api_key: str = "" + siliconflow_api_base: str = "https://api.siliconflow.cn/v1" + + def validate(self): + valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number + if not valid: + logger.error("Invalid Command Arguments") + sys.exit(1) + + +def load_config_yaml(yaml_path): + with open(yaml_path, "r", encoding="utf-8") as config_in: + data = yaml.safe_load(config_in) + + if data is None: + return None + return Config(**data) + + +def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, + pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str): + pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + for org_item in conf.orgs: + issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + if org_item.org_name != pr_owner: + continue + # 旧标点符号判断逻辑,已弃用 + # if org_item.auto_create_issue: + # cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude) + file_count = 0 + diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) + if diff_content is None: + sys.exit(1) + diff_files = get_diff_file_list(diff_content) + zh_file = [] + en_file = [] + need_create_issue = {} + for trigger in org_item.issue_triggers: + for diff_file in diff_files: + if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: + logger.info("file {} has been changed".format(diff_file)) + file_count += 1 + if "/zh" in trigger.trigger_pr_path: + need_create_issue["zh"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + zh_file.append(diff_file.replace("zh/", "")) + elif "/en" in trigger.trigger_pr_path: + need_create_issue["en"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + en_file.append(diff_file.replace("en/", "")) + else: + logger.warning("not a range") + changed_same_files = False + for z in zh_file: + if z in en_file: + changed_same_files = True + else: + changed_same_files = False + if file_count == 0: + logger.warning( + "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) + return + if changed_same_files: + logger.info("changed the same files in en and zh path, no need to create issue") + return + + need_create_issue_template = {} + need_create_issue_titles = [] + for issue_item in need_create_issue: + need_create_issue_titles.append(need_create_issue[issue_item][1]) + need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] + if need_create_issue_titles: + + need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, + org_item.issue_of_repo, + need_create_issue_titles) + + if not need_create_issue_list: + feedback_comment = "issue has already created, please go to check issue: {}".format( + existed_issue_list) + logger.info("Warning: " + feedback_comment) + cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) + for need_create_issue_item in need_create_issue_list: + + issue_summary = get_agent_summary(diff_content, siliconflow_api_key, siliconflow_api_base) + issue_body = "" + if issue_summary and not issue_summary.error: + issue_body += f"## 📊 变更统计\n\n" + issue_body += f"- **总文件数**: {issue_summary.total_files}\n" + issue_body += f"- **成功处理文件数**: {issue_summary.processed_files}\n" + if 
issue_summary.total_files != issue_summary.processed_files: + # 注意人工审查提醒 + issue_body += f"- **未处理文件数**: {issue_summary.total_files - issue_summary.processed_files}\n" + issue_body += f"- **提醒:机器人未能及时自动生成所有改动的摘要,请注意人工审查!**\n" + if issue_summary.total_summary: + total = issue_summary.total_summary + issue_body += f"- **总改动行数**: {total.total_lines_changed}\n" + issue_body += f"- **改动类型**: {', '.join(total.change_type_list)}\n\n" + issue_body += f"## 🔍 整体变更摘要\n\n" + issue_body += f"{total.overall_summary}\n\n" + issue_body += f"## ⚠️ 整体潜在影响\n\n" + issue_body += f"{total.overall_potential_impact}\n\n" + if issue_summary.file_summaries: + issue_body += f"## 📝 单文件变更详情\n\n" + for summary in issue_summary.file_summaries: + issue_body += f"### 📁 {summary.file_path}\n\n" + issue_body += f"- **改动类型**: {summary.change_type}\n" + issue_body += f"- **新增行数**: {summary.lines_added}\n" + issue_body += f"- **删除行数**: {summary.lines_deleted}\n" + issue_body += f"- **潜在影响**: {summary.potential_impact}\n" + issue_body += f"- **详细摘要**: {summary.summary}\n\n" + issue_body += "---\n\n" + else: + issue_body += f"## ⚠️ 翻译变更检测\n\n" + issue_body += f"检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n" + issue_body += f"**变更文件数量**: {len(diff_files)}\n" + issue_body += f"**相关PR**: {pr__html_url}\n\n" + + issue_body += f"## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,仅供参考,请以实际更改为准。\n\n" + issue_body += f"## 🔗 相关PR链接\n\n" + issue_body += f"- {pr__html_url}\n" + + cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, + need_create_issue_template[need_create_issue_item], + issue_body) + + + +def main(): + parser = argparse.ArgumentParser(description='Create Gitee Webhook based on the profile') + parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') + parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') + parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') + parser.add_argument('--pr_number', type=str, required=True, help='the PR number') + parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') + parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') + args = Args() + parser.parse_args(args=sys.argv[1:], namespace=args) + args.validate() + + exec_py = sys.argv[0] + config_yaml_path = exec_py[:-2] + 'yaml' + conf = load_config_yaml(config_yaml_path) + + cli = GiteeClient(args.gitee_token) + + pr_owner = args.pr_owner + pr_repo = args.pr_repo + pr_number = args.pr_number + siliconflow_api_key = args.siliconflow_api_key + siliconflow_api_base = args.siliconflow_api_base + create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) + + +if __name__ == '__main__': + main() diff --git a/ci/tools/translation/new_create_translation_issue_AI.yaml b/ci/tools/translation/new_create_translation_issue_AI.yaml new file mode 100755 index 000000000..bc48ab7a2 --- /dev/null +++ b/ci/tools/translation/new_create_translation_issue_AI.yaml @@ -0,0 +1,49 @@ +# Translation Agent Configuration +translation_agent: + # Backend Configuration + backend: + type: "siliconflow" # Options: "ollama" or "siliconflow" + # siliconflow配置现在通过命令行参数传入 + ollama: + base_url: "http://localhost:11434" + + # Model Configuration + model: + name: "Qwen/Qwen3-32B" # Options: "llama3" "Qwen/Qwen3-8B" "THUDM/GLM-4-32B-0414" or others + temperature: 0.1 + max_retry: 5 # For siliconflow backend 
+ max_retry_ollama: 1 # For ollama backend + + # Processing Configuration + processing: + max_workers: 8 # Number of parallel workers for file processing + single_file_timeout: 180 # Timeout for single file summary generation (seconds) + total_summary_timeout: 300 # Timeout for total summary generation (seconds) + + # Logging Configuration + logging: + level: "INFO" + +# Issue Creation Configuration +orgs: + - org_name: openeuler + issue_of_owner: openeuler + issue_of_repo: globalization + auto_create_issue: true + issue_triggers: + - trigger_pr_path: 'docs/zh' + issue_title: "[Auto] This is an English translation issue for the PR" + issue_assignee: judithsq + file_extension: [ doc, md, json ] + change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] + + - org_name: src-openeuler + issue_of_owner: openeuler + issue_of_repo: globalization + auto_create_issue: true + issue_triggers: + - trigger_pr_path: 'docs/zh' + issue_title: "[Auto] This is an English translation issue for the PR" + issue_assignee: judithsq + file_extension: [ doc, md, json ] + change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] \ No newline at end of file -- Gitee From 8177646f2d47ce618c6ede807275ec200565555b Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Fri, 26 Sep 2025 01:54:24 +0800 Subject: [PATCH 5/8] update for review --- .../new_create_translation_issue_AI.py | 291 ++++++++++++------ ci/tools/translation/translation.yaml | 7 +- ci/tools/translation/translation_agent.py | 197 ++++++------ 3 files changed, 298 insertions(+), 197 deletions(-) diff --git a/ci/tools/translation/new_create_translation_issue_AI.py b/ci/tools/translation/new_create_translation_issue_AI.py index 0a61c7136..272b70711 100755 --- a/ci/tools/translation/new_create_translation_issue_AI.py +++ b/ci/tools/translation/new_create_translation_issue_AI.py @@ -1,3 +1,10 @@ +# ==================== 常量定义 ==================== + +# Issue类型常量 +ISSUE_TYPE_TRANSLATION = "翻译" + +# ==================== 数据模型定义 ==================== + import argparse import json import logging @@ -156,7 +163,7 @@ class GiteeClient: req_body = { "repo": repo, "title": title, - "issue_type": "翻译", + "issue_type": ISSUE_TYPE_TRANSLATION, "body": body, "assignee": assignee, "push_events": False, @@ -263,112 +270,192 @@ def load_config_yaml(yaml_path): return Config(**data) +def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger], issue_title_pr_mark: str) -> tuple[int, list[str], list[str], dict]: + """ + 分析diff文件,识别需要创建issue的文件 + 返回: (文件计数, 中文文件列表, 英文文件列表, 需要创建的issue字典) + """ + file_count = 0 + zh_file = [] + en_file = [] + need_create_issue = {} + + for trigger in issue_triggers: + for diff_file in diff_files: + if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: + logger.info("file {} has been changed".format(diff_file)) + file_count += 1 + if "/zh" in trigger.trigger_pr_path: + need_create_issue["zh"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + zh_file.append(diff_file.replace("zh/", "")) + elif "/en" in trigger.trigger_pr_path: + need_create_issue["en"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + en_file.append(diff_file.replace("en/", "")) + else: + logger.warning("not a range") + + return file_count, zh_file, en_file, need_create_issue + + +def check_same_files_changed(zh_file: list[str], en_file: list[str]) -> bool: + 
""" + 检查中英文路径下是否修改了相同的文件 + """ + for z in zh_file: + if z in en_file: + return True + return False + + +def prepare_issue_templates(need_create_issue: dict) -> tuple[dict, list[str]]: + """ + 准备issue模板和标题列表 + """ + need_create_issue_template = {} + need_create_issue_titles = [] + for issue_item in need_create_issue: + need_create_issue_titles.append(need_create_issue[issue_item][1]) + need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] + return need_create_issue_template, need_create_issue_titles + + +def generate_issue_body(issue_summary, diff_files: list[str], pr_html_url: str) -> str: + """ + 生成issue的正文内容 + """ + issue_body = "" + if issue_summary and not issue_summary.error: + issue_body += f"## 📊 变更统计\n\n" + issue_body += f"- **总文件数**: {issue_summary.total_files}\n" + issue_body += f"- **成功处理文件数**: {issue_summary.processed_files}\n" + if issue_summary.total_files != issue_summary.processed_files: + # 注意人工审查提醒 + issue_body += f"- **未处理文件数**: {issue_summary.total_files - issue_summary.processed_files}\n" + issue_body += f"- **提醒:机器人未能及时自动生成所有改动的摘要,请注意人工审查!**\n" + if issue_summary.total_summary: + total = issue_summary.total_summary + issue_body += f"- **总改动行数**: {total.total_lines_changed}\n" + issue_body += f"- **改动类型**: {', '.join(total.change_type_list)}\n\n" + issue_body += f"## 🔍 整体变更摘要\n\n" + issue_body += f"{total.overall_summary}\n\n" + issue_body += f"## ⚠️ 整体潜在影响\n\n" + issue_body += f"{total.overall_potential_impact}\n\n" + if issue_summary.file_summaries: + issue_body += f"## 📝 单文件变更详情\n\n" + for summary in issue_summary.file_summaries: + issue_body += f"### 📁 {summary.file_path}\n\n" + issue_body += f"- **改动类型**: {summary.change_type}\n" + issue_body += f"- **新增行数**: {summary.lines_added}\n" + issue_body += f"- **删除行数**: {summary.lines_deleted}\n" + issue_body += f"- **潜在影响**: {summary.potential_impact}\n" + issue_body += f"- **详细摘要**: {summary.summary}\n\n" + issue_body += "---\n\n" + else: + issue_body += f"## ⚠️ 翻译变更检测\n\n" + issue_body += f"检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n" + issue_body += f"**变更文件数量**: {len(diff_files)}\n" + issue_body += f"**相关PR**: {pr_html_url}\n\n" + + issue_body += f"## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,仅供参考,请以实际更改为准。\n\n" + issue_body += f"## 🔗 相关PR链接\n\n" + issue_body += f"- {pr_html_url}\n" + + return issue_body + + +def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: str, pr_number: int, + siliconflow_api_key: str, siliconflow_api_base: str, pr_html_url: str, issue_title_pr_mark: str, + translation_agent_config: TranslationAgentConfig = None): + """ + 处理单个组织配置项 + """ + # 获取diff内容 + diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) + if diff_content is None: + sys.exit(1) + + diff_files = get_diff_file_list(diff_content) + + # 分析diff文件 + file_count, zh_file, en_file, need_create_issue = analyze_diff_files( + diff_files, org_item.issue_triggers, issue_title_pr_mark) + + # 检查是否修改了相同文件 + changed_same_files = check_same_files_changed(zh_file, en_file) + + # 验证是否需要创建issue + if file_count == 0: + logger.warning( + "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) + return + + if changed_same_files: + logger.info("changed the same files in en and zh path, no need to create issue") + return + + # 准备issue模板 + need_create_issue_template, need_create_issue_titles = prepare_issue_templates(need_create_issue) + + if not need_create_issue_titles: + return + + # 检查issue是否已存在 + need_create_issue_list, existed_issue_list = 
cli.check_issue_exists( + org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_titles) + + if not need_create_issue_list: + feedback_comment = "issue has already created, please go to check issue: {}".format(existed_issue_list) + logger.info("Warning: " + feedback_comment) + cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) + return + + # 创建issue + for need_create_issue_item in need_create_issue_list: + # 从配置中提取参数 + backend_config = translation_agent_config.backend if translation_agent_config else {} + model_config = translation_agent_config.model if translation_agent_config else {} + processing_config = translation_agent_config.processing if translation_agent_config else {} + + # 提取具体配置值 + backend_type = backend_config.get('type', 'siliconflow') + model_name = model_config.get('name', 'Qwen/Qwen3-8B') + temperature = model_config.get('temperature', 0.1) + max_workers = processing_config.get('max_workers', 8) + single_file_timeout = processing_config.get('single_file_timeout', 180) + total_summary_timeout = processing_config.get('total_summary_timeout', 300) + max_retry = model_config.get('max_retry', 5) + max_retry_ollama = model_config.get('max_retry_ollama', 1) + + issue_summary = get_agent_summary( + diff_content, siliconflow_api_key, siliconflow_api_base, + model_name=model_name, backend_type=backend_type, temperature=temperature, + max_workers=max_workers, single_file_timeout=single_file_timeout, + total_summary_timeout=total_summary_timeout, max_retry=max_retry, + max_retry_ollama=max_retry_ollama + ) + issue_body = generate_issue_body(issue_summary, diff_files, pr_html_url) + + cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, + need_create_issue_template[need_create_issue_item], issue_body) + + def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str): - pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + """ + 基于PR diff和配置创建issue的主函数 + """ + pr_html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + for org_item in conf.orgs: - issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) if org_item.org_name != pr_owner: continue - # 旧标点符号判断逻辑,已弃用 - # if org_item.auto_create_issue: - # cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude) - file_count = 0 - diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) - if diff_content is None: - sys.exit(1) - diff_files = get_diff_file_list(diff_content) - zh_file = [] - en_file = [] - need_create_issue = {} - for trigger in org_item.issue_triggers: - for diff_file in diff_files: - if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: - logger.info("file {} has been changed".format(diff_file)) - file_count += 1 - if "/zh" in trigger.trigger_pr_path: - need_create_issue["zh"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] - zh_file.append(diff_file.replace("zh/", "")) - elif "/en" in trigger.trigger_pr_path: - need_create_issue["en"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] - en_file.append(diff_file.replace("en/", "")) - else: - logger.warning("not a range") - changed_same_files = False - for z in zh_file: - 
if z in en_file: - changed_same_files = True - else: - changed_same_files = False - if file_count == 0: - logger.warning( - "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) - return - if changed_same_files: - logger.info("changed the same files in en and zh path, no need to create issue") - return - - need_create_issue_template = {} - need_create_issue_titles = [] - for issue_item in need_create_issue: - need_create_issue_titles.append(need_create_issue[issue_item][1]) - need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] - if need_create_issue_titles: - - need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, - org_item.issue_of_repo, - need_create_issue_titles) - - if not need_create_issue_list: - feedback_comment = "issue has already created, please go to check issue: {}".format( - existed_issue_list) - logger.info("Warning: " + feedback_comment) - cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) - for need_create_issue_item in need_create_issue_list: - - issue_summary = get_agent_summary(diff_content, siliconflow_api_key, siliconflow_api_base) - issue_body = "" - if issue_summary and not issue_summary.error: - issue_body += f"## 📊 变更统计\n\n" - issue_body += f"- **总文件数**: {issue_summary.total_files}\n" - issue_body += f"- **成功处理文件数**: {issue_summary.processed_files}\n" - if issue_summary.total_files != issue_summary.processed_files: - # 注意人工审查提醒 - issue_body += f"- **未处理文件数**: {issue_summary.total_files - issue_summary.processed_files}\n" - issue_body += f"- **提醒:机器人未能及时自动生成所有改动的摘要,请注意人工审查!**\n" - if issue_summary.total_summary: - total = issue_summary.total_summary - issue_body += f"- **总改动行数**: {total.total_lines_changed}\n" - issue_body += f"- **改动类型**: {', '.join(total.change_type_list)}\n\n" - issue_body += f"## 🔍 整体变更摘要\n\n" - issue_body += f"{total.overall_summary}\n\n" - issue_body += f"## ⚠️ 整体潜在影响\n\n" - issue_body += f"{total.overall_potential_impact}\n\n" - if issue_summary.file_summaries: - issue_body += f"## 📝 单文件变更详情\n\n" - for summary in issue_summary.file_summaries: - issue_body += f"### 📁 {summary.file_path}\n\n" - issue_body += f"- **改动类型**: {summary.change_type}\n" - issue_body += f"- **新增行数**: {summary.lines_added}\n" - issue_body += f"- **删除行数**: {summary.lines_deleted}\n" - issue_body += f"- **潜在影响**: {summary.potential_impact}\n" - issue_body += f"- **详细摘要**: {summary.summary}\n\n" - issue_body += "---\n\n" - else: - issue_body += f"## ⚠️ 翻译变更检测\n\n" - issue_body += f"检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n" - issue_body += f"**变更文件数量**: {len(diff_files)}\n" - issue_body += f"**相关PR**: {pr__html_url}\n\n" - - issue_body += f"## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,仅供参考,请以实际更改为准。\n\n" - issue_body += f"## 🔗 相关PR链接\n\n" - issue_body += f"- {pr__html_url}\n" - - cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, - need_create_issue_template[need_create_issue_item], - issue_body) + + process_org_item(org_item, cli, pr_owner, pr_repo, pr_number, + siliconflow_api_key, siliconflow_api_base, pr_html_url, issue_title_pr_mark, + conf.translation_agent) diff --git a/ci/tools/translation/translation.yaml b/ci/tools/translation/translation.yaml index 2dfccd235..6842cfac9 100644 --- a/ci/tools/translation/translation.yaml +++ b/ci/tools/translation/translation.yaml @@ -8,14 +8,9 @@ repositories: assign_issue: - title: "[Auto] This is an English translation issue." 
- sign_to: judithsq -# - trigger_pr_path: 'docs/en' -# file_extension: [doc, md, json] -# assign_issue: -# - title: "[Auto] This is a Russian translation issue." -# - sign_to: judithsq exclude: - condition: only_marks_change - check_list: [',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、'] + check_list: [',', ',', '.', '。', ';', ';', ':', ':', '"', '"', '"', '、'] - owner: openeuler repo: website-v2 auto_create_issue: false diff --git a/ci/tools/translation/translation_agent.py b/ci/tools/translation/translation_agent.py index 258826eb5..60e4aac60 100755 --- a/ci/tools/translation/translation_agent.py +++ b/ci/tools/translation/translation_agent.py @@ -20,41 +20,26 @@ from langchain_core.output_parsers import JsonOutputParser from langchain_openai import ChatOpenAI import yaml -# ==================== 配置加载 ==================== - -def load_config(config_file="new_create_translation_issue.yaml"): - """从YAML文件加载配置""" - try: - with open(config_file, 'r', encoding='utf-8') as f: - config = yaml.safe_load(f) - return config.get('translation_agent', {}) - except FileNotFoundError: - print(f"配置文件 {config_file} 不存在") - raise - except yaml.YAMLError as e: - print(f"解析配置文件时发生错误: {e}") - raise - -# 加载配置 -_config = load_config() - # ==================== 配置常量 ==================== -BACKEND_TYPE = _config.get('backend', {}).get('type', 'siliconflow') -OLLAMA_BASE_URL = _config.get('backend', {}).get('ollama', {}).get('base_url', 'http://localhost:11434') -MODEL_NAME = _config.get('model', {}).get('name', 'Qwen/Qwen3-8B') -MODEL_TEMPERATURE = _config.get('model', {}).get('temperature', 0.1) -MODEL_MAX_RETRY = _config.get('model', {}).get('max_retry', 5) -MODEL_MAX_RETRY_OLLAMA = _config.get('model', {}).get('max_retry_ollama', 1) -PROCESSING_MAX_WORKERS = _config.get('processing', {}).get('max_workers', 8) -SINGLE_FILE_TIMEOUT = _config.get('processing', {}).get('single_file_timeout', 180) -TOTAL_SUMMARY_TIMEOUT = _config.get('processing', {}).get('total_summary_timeout', 300) -LOGGING_LEVEL = _config.get('logging', {}).get('level', 'INFO') -SILICONFLOW_API_KEY = '' -SILICONFLOW_API_BASE ='' +# 后端类型常量 +BACKEND_TYPE_OLLAMA = "ollama" +BACKEND_TYPE_SILICONFLOW = "siliconflow" + +# 默认配置值 +DEFAULT_BACKEND_TYPE = BACKEND_TYPE_SILICONFLOW +DEFAULT_OLLAMA_BASE_URL = 'http://localhost:11434' +DEFAULT_MODEL_NAME = 'Qwen/Qwen3-8B' +DEFAULT_MODEL_TEMPERATURE = 0.1 +DEFAULT_MODEL_MAX_RETRY = 5 +DEFAULT_MODEL_MAX_RETRY_OLLAMA = 1 +DEFAULT_PROCESSING_MAX_WORKERS = 8 +DEFAULT_SINGLE_FILE_TIMEOUT = 180 +DEFAULT_TOTAL_SUMMARY_TIMEOUT = 300 +DEFAULT_LOGGING_LEVEL = 'INFO' # 配置日志 -logging.basicConfig(level=getattr(logging, LOGGING_LEVEL.upper())) +logging.basicConfig(level=getattr(logging, DEFAULT_LOGGING_LEVEL.upper())) logger = logging.getLogger(__name__) # ==================== 数据模型定义 ==================== @@ -103,7 +88,7 @@ class ProcessingResult: # ==================== Token 统计工具 ==================== class TokenCounter: - def __init__(self, model_name=MODEL_NAME): + def __init__(self, model_name=DEFAULT_MODEL_NAME): self.model_name = model_name self.prompt_tokens = 0 self.completion_tokens = 0 @@ -432,52 +417,62 @@ class LLMFactory: """LLM工厂类""" @staticmethod - def create_chat_llm(model_name: str = None, base_url: str = None): + def create_chat_llm(model_name: str = None, base_url: str = None, backend_type: str = None, + temperature: float = None, siliconflow_api_key: str = "", siliconflow_api_base: str = ""): """创建LLM实例""" if model_name is None: - model_name = MODEL_NAME + model_name = DEFAULT_MODEL_NAME if 
base_url is None: - base_url = OLLAMA_BASE_URL + base_url = DEFAULT_OLLAMA_BASE_URL + if backend_type is None: + backend_type = DEFAULT_BACKEND_TYPE + if temperature is None: + temperature = DEFAULT_MODEL_TEMPERATURE - if BACKEND_TYPE == "ollama": + if backend_type == BACKEND_TYPE_OLLAMA: return ChatOllama( model=model_name, base_url=base_url, - temperature=MODEL_TEMPERATURE + temperature=temperature ) - elif BACKEND_TYPE == "siliconflow": + elif backend_type == BACKEND_TYPE_SILICONFLOW: return ChatOpenAI( model=model_name, - api_key=SecretStr(SILICONFLOW_API_KEY), - base_url=SILICONFLOW_API_BASE, - temperature=MODEL_TEMPERATURE + api_key=SecretStr(siliconflow_api_key), + base_url=siliconflow_api_base, + temperature=temperature ) else: - raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}") + raise ValueError(f"不支持的后端类型: {backend_type}") @staticmethod - def create_llm(model_name: str = None, base_url: str = None): + def create_llm(model_name: str = None, base_url: str = None, backend_type: str = None, + temperature: float = None, siliconflow_api_key: str = "", siliconflow_api_base: str = ""): """创建LLM实例""" if model_name is None: - model_name = MODEL_NAME + model_name = DEFAULT_MODEL_NAME if base_url is None: - base_url = OLLAMA_BASE_URL + base_url = DEFAULT_OLLAMA_BASE_URL + if backend_type is None: + backend_type = DEFAULT_BACKEND_TYPE + if temperature is None: + temperature = DEFAULT_MODEL_TEMPERATURE - if BACKEND_TYPE == "ollama": + if backend_type == BACKEND_TYPE_OLLAMA: return Ollama( model=model_name, base_url=base_url, - temperature=MODEL_TEMPERATURE + temperature=temperature ) - elif BACKEND_TYPE == "siliconflow": + elif backend_type == BACKEND_TYPE_SILICONFLOW: return ChatOpenAI( model=model_name, - api_key=SecretStr(SILICONFLOW_API_KEY), - base_url=SILICONFLOW_API_BASE, - temperature=MODEL_TEMPERATURE + api_key=SecretStr(siliconflow_api_key), + base_url=siliconflow_api_base, + temperature=temperature ) else: - raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}") + raise ValueError(f"不支持的后端类型: {backend_type}") class PromptTemplates: """提示模板集合""" @@ -669,15 +664,16 @@ Git Diff 内容: class SingleFileAnalysisChain: """单文件分析任务链""" - def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter): + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, backend_type: str = DEFAULT_BACKEND_TYPE): self.llm = llm self.token_counter = token_counter + self.backend_type = backend_type # 创建输出解析器 self.output_parser = JsonOutputParser(pydantic_object=SingleFileSummary) # 根据后端类型选择不同的链构建方式 - if BACKEND_TYPE == "ollama": + if backend_type == BACKEND_TYPE_OLLAMA: self.prompt = PromptTemplates.get_single_file_prompt() self.chain = self.prompt | self.llm.with_structured_output(SingleFileSummary) else: @@ -809,10 +805,11 @@ Git Diff 内容: ]) self.chain = self.prompt | self.llm | self.output_parser - def analyze(self, diff_file_info: DiffFileInfo) -> Optional[SingleFileSummary]: + def analyze(self, diff_file_info: DiffFileInfo, max_retry_ollama: int = DEFAULT_MODEL_MAX_RETRY_OLLAMA, + max_retry: int = DEFAULT_MODEL_MAX_RETRY) -> Optional[SingleFileSummary]: """分析单个文件的改动""" - max_retry = MODEL_MAX_RETRY_OLLAMA if BACKEND_TYPE == "ollama" else MODEL_MAX_RETRY - for attempt in range(1, max_retry + 1): + max_retry_count = max_retry_ollama if self.backend_type == BACKEND_TYPE_OLLAMA else max_retry + for attempt in range(1, max_retry_count + 1): # 如果不是第一次尝试,等待一段时间再重试,避免连续失败 if attempt > 1: delay = min(attempt * 2, 10) # 递增延迟,最多10秒 @@ -843,7 +840,7 @@ Git Diff 内容: "lines_added": 
diff_file_info.lines_added, "lines_deleted": diff_file_info.lines_deleted } - if BACKEND_TYPE != "ollama": + if self.backend_type != BACKEND_TYPE_OLLAMA: invoke_args["response_format"] = {"type": "json_object"} result = self.chain.invoke(invoke_args) @@ -870,7 +867,7 @@ Git Diff 内容: # 结果无效,记录并重试 logger.warning(f"分析文件 {diff_file_info.file_path} 返回无效结果,第{attempt}次尝试") - if attempt < max_retry: + if attempt < max_retry_count: continue except Exception as e: err_str = str(e) @@ -884,29 +881,30 @@ Git Diff 内容: is_http_error = True if is_http_error: logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e},第{attempt}次尝试,10秒后重试...") - if attempt < max_retry: + if attempt < max_retry_count: time.sleep(10) continue else: logger.error(f"分析文件 {diff_file_info.file_path} 时发生错误: {e},第{attempt}次尝试") # 其它异常直接进入下一次重试 - if attempt < max_retry: + if attempt < max_retry_count: logger.info(f"第{attempt}次尝试失败,准备重试...") - logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry}次均未获得结构化输出,放弃。") + logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry_count}次均未获得结构化输出,放弃。") return None class TotalSummaryChain: """总摘要生成任务链""" - def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter): + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, backend_type: str = DEFAULT_BACKEND_TYPE): self.llm = llm self.token_counter = token_counter + self.backend_type = backend_type # 创建输出解析器 self.output_parser = JsonOutputParser(pydantic_object=TotalSummary) # 根据后端类型选择不同的链构建方式 - if BACKEND_TYPE == "ollama": + if backend_type == BACKEND_TYPE_OLLAMA: self.prompt = PromptTemplates.get_total_summary_prompt() self.chain = self.prompt | self.llm.with_structured_output(TotalSummary) else: @@ -985,7 +983,7 @@ class TotalSummaryChain: ]) self.chain = self.prompt | self.llm | self.output_parser - def generate(self, file_summaries: List[SingleFileSummary]) -> Optional[TotalSummary]: + def generate(self, file_summaries: List[SingleFileSummary], total_summary_timeout: int = DEFAULT_TOTAL_SUMMARY_TIMEOUT) -> Optional[TotalSummary]: """生成总摘要""" try: total_files = len(file_summaries) @@ -1027,16 +1025,16 @@ class TotalSummaryChain: "total_files": total_files, "total_lines": total_lines } - if BACKEND_TYPE != "ollama": + if self.backend_type != BACKEND_TYPE_OLLAMA: # 为 SiliconFlow 添加 response_format 参数 invoke_args["response_format"] = {"type": "json_object"} # 提交任务并设置超时 future = timeout_executor.submit(self.chain.invoke, invoke_args) try: - result = future.result(timeout=TOTAL_SUMMARY_TIMEOUT) + result = future.result(timeout=total_summary_timeout) except (FutureTimeoutError, TimeoutError) as e: - logger.error(f"生成总摘要超时({TOTAL_SUMMARY_TIMEOUT}秒),放弃生成总摘要: {type(e).__name__}") + logger.error(f"生成总摘要超时({total_summary_timeout}秒),放弃生成总摘要: {type(e).__name__}") try: future.cancel() # 尝试取消超时的任务 except Exception as cancel_e: @@ -1091,23 +1089,40 @@ class TotalSummaryChain: class GitDiffSummarizer: """Git Diff 摘要生成器""" - def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", model_name: str = None, base_url: str = None): + def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", + model_name: str = None, base_url: str = None, backend_type: str = None, + temperature: float = None, max_workers: int = None, single_file_timeout: int = None, + total_summary_timeout: int = None, max_retry: int = None, max_retry_ollama: int = None): if model_name is None: - model_name = MODEL_NAME + model_name = 
DEFAULT_MODEL_NAME if base_url is None: - base_url = OLLAMA_BASE_URL - - # 设置siliconflow API配置 - global SILICONFLOW_API_KEY, SILICONFLOW_API_BASE - if siliconflow_api_key: - SILICONFLOW_API_KEY = siliconflow_api_key - if siliconflow_api_base: - SILICONFLOW_API_BASE = siliconflow_api_base + base_url = DEFAULT_OLLAMA_BASE_URL + if backend_type is None: + backend_type = DEFAULT_BACKEND_TYPE + if temperature is None: + temperature = DEFAULT_MODEL_TEMPERATURE + if max_workers is None: + max_workers = DEFAULT_PROCESSING_MAX_WORKERS + if single_file_timeout is None: + single_file_timeout = DEFAULT_SINGLE_FILE_TIMEOUT + if total_summary_timeout is None: + total_summary_timeout = DEFAULT_TOTAL_SUMMARY_TIMEOUT + if max_retry is None: + max_retry = DEFAULT_MODEL_MAX_RETRY + if max_retry_ollama is None: + max_retry_ollama = DEFAULT_MODEL_MAX_RETRY_OLLAMA + + self.backend_type = backend_type + self.max_workers = max_workers + self.single_file_timeout = single_file_timeout + self.total_summary_timeout = total_summary_timeout + self.max_retry = max_retry + self.max_retry_ollama = max_retry_ollama self.token_counter = TokenCounter(model_name) - self.llm = LLMFactory.create_chat_llm(model_name, base_url) - self.single_file_chain = SingleFileAnalysisChain(self.llm, self.token_counter) - self.total_summary_chain = TotalSummaryChain(self.llm, self.token_counter) + self.llm = LLMFactory.create_chat_llm(model_name, base_url, backend_type, temperature, siliconflow_api_key, siliconflow_api_base) + self.single_file_chain = SingleFileAnalysisChain(self.llm, self.token_counter, backend_type) + self.total_summary_chain = TotalSummaryChain(self.llm, self.token_counter, backend_type) def cleanup(self): """清理资源,确保程序能正确退出""" @@ -1119,7 +1134,7 @@ class GitDiffSummarizer: self.llm._client.close() # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端 - if BACKEND_TYPE == "siliconflow" and hasattr(self.llm, 'client'): + if self.backend_type == BACKEND_TYPE_SILICONFLOW and hasattr(self.llm, 'client'): try: # 强制关闭 httpx 客户端 if hasattr(self.llm.client, '_client'): @@ -1133,7 +1148,7 @@ class GitDiffSummarizer: def process_git_diff(self, diff_content: str, max_workers: int = None) -> ProcessingResult: if max_workers is None: - max_workers = PROCESSING_MAX_WORKERS + max_workers = self.max_workers logger.info("开始解析git diff...") files = DiffParser.parse_git_diff(diff_content) @@ -1154,12 +1169,12 @@ class GitDiffSummarizer: try: executor = ThreadPoolExecutor(max_workers=max_workers) future_to_file = { - executor.submit(self.single_file_chain.analyze, file_info): file_info.file_path + executor.submit(self.single_file_chain.analyze, file_info, self.max_retry_ollama, self.max_retry): file_info.file_path for file_info in files } # 设置更长的整体超时时间,避免与单个文件超时冲突 - overall_timeout = SINGLE_FILE_TIMEOUT * len(files) + 600 # 给每个文件的时间 + 额外缓冲 + overall_timeout = self.single_file_timeout * len(files) + 600 # 给每个文件的时间 + 额外缓冲 completed_count = 0 total_count = len(future_to_file) @@ -1205,7 +1220,7 @@ class GitDiffSummarizer: if file_summaries: logger.info(f"基于 {len(file_summaries)} 个成功处理的文件生成总摘要...") try: - total_summary = self.total_summary_chain.generate(file_summaries) + total_summary = self.total_summary_chain.generate(file_summaries, self.total_summary_timeout) if total_summary: logger.info("总摘要生成成功") else: @@ -1223,9 +1238,14 @@ class GitDiffSummarizer: # ==================== 主函数 ==================== -def get_agent_summary(sample_diff, siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1"): +def get_agent_summary(sample_diff, 
siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1", + model_name=None, base_url=None, backend_type=None, temperature=None, + max_workers=None, single_file_timeout=None, total_summary_timeout=None, + max_retry=None, max_retry_ollama=None): - summarizer = GitDiffSummarizer(siliconflow_api_key, siliconflow_api_base) + summarizer = GitDiffSummarizer(siliconflow_api_key, siliconflow_api_base, model_name, base_url, + backend_type, temperature, max_workers, single_file_timeout, + total_summary_timeout, max_retry, max_retry_ollama) result = None try: result = summarizer.process_git_diff(sample_diff) @@ -1269,7 +1289,6 @@ def get_agent_summary(sample_diff, siliconflow_api_key="", siliconflow_api_base= print(f"Prompt tokens: {stats['prompt_tokens']}") print(f"Completion tokens: {stats['completion_tokens']}") print(f"Total tokens: {stats['total_tokens']}") - # exit() return result if __name__ == "__main__": -- Gitee From 77745d3a03bef79d0f944b3a1b31e880dc78a05f Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Fri, 26 Sep 2025 02:03:38 +0800 Subject: [PATCH 6/8] update for review --- .../new_create_translation_issue_AI.py | 83 ++++++++++++------- .../new_create_translation_issue_AI.yaml | 6 +- ci/tools/translation/translation_agent.py | 62 +++++++++----- 3 files changed, 98 insertions(+), 53 deletions(-) diff --git a/ci/tools/translation/new_create_translation_issue_AI.py b/ci/tools/translation/new_create_translation_issue_AI.py index 272b70711..4e0a52e3b 100755 --- a/ci/tools/translation/new_create_translation_issue_AI.py +++ b/ci/tools/translation/new_create_translation_issue_AI.py @@ -132,7 +132,8 @@ class GiteeClient: logger.error("can not get diff file from PR: {}".format(req_url)) return result - def check_issue_exists(self, owner: str, repo: str, issue_titles: list[str]) -> tuple[list[str], list[str]]: + def check_issue_exists(self, owner: str, repo: str, + issue_titles: list[str]) -> tuple[list[str], list[str]]: req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) page = 1 existed_issues = [] @@ -270,7 +271,8 @@ def load_config_yaml(yaml_path): return Config(**data) -def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger], issue_title_pr_mark: str) -> tuple[int, list[str], list[str], dict]: +def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger], + issue_title_pr_mark: str) -> tuple[int, list[str], list[str], dict]: """ 分析diff文件,识别需要创建issue的文件 返回: (文件计数, 中文文件列表, 英文文件列表, 需要创建的issue字典) @@ -282,16 +284,21 @@ def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger] for trigger in issue_triggers: for diff_file in diff_files: - if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: + if diff_file.startswith(trigger.trigger_pr_path) and \ + diff_file.split('.')[-1] in trigger.file_extension: logger.info("file {} has been changed".format(diff_file)) file_count += 1 if "/zh" in trigger.trigger_pr_path: - need_create_issue["zh"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + need_create_issue["zh"] = [ + trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark) + ] zh_file.append(diff_file.replace("zh/", "")) elif "/en" in trigger.trigger_pr_path: - need_create_issue["en"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + need_create_issue["en"] = [ + trigger.issue_assignee, + 
"{}({}).".format(trigger.issue_title, issue_title_pr_mark) + ] en_file.append(diff_file.replace("en/", "")) else: logger.warning("not a range") @@ -317,7 +324,8 @@ def prepare_issue_templates(need_create_issue: dict) -> tuple[dict, list[str]]: need_create_issue_titles = [] for issue_item in need_create_issue: need_create_issue_titles.append(need_create_issue[issue_item][1]) - need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] + need_create_issue_template[need_create_issue[issue_item][1]] = \ + need_create_issue[issue_item][0] return need_create_issue_template, need_create_issue_titles @@ -333,7 +341,8 @@ def generate_issue_body(issue_summary, diff_files: list[str], pr_html_url: str) if issue_summary.total_files != issue_summary.processed_files: # 注意人工审查提醒 issue_body += f"- **未处理文件数**: {issue_summary.total_files - issue_summary.processed_files}\n" - issue_body += f"- **提醒:机器人未能及时自动生成所有改动的摘要,请注意人工审查!**\n" + issue_body += f"- **提醒:机器人未能及时自动生成所有改动的摘要," \ + f"请注意人工审查!**\n" if issue_summary.total_summary: total = issue_summary.total_summary issue_body += f"- **总改动行数**: {total.total_lines_changed}\n" @@ -358,15 +367,17 @@ def generate_issue_body(issue_summary, diff_files: list[str], pr_html_url: str) issue_body += f"**变更文件数量**: {len(diff_files)}\n" issue_body += f"**相关PR**: {pr_html_url}\n\n" - issue_body += f"## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,仅供参考,请以实际更改为准。\n\n" + issue_body += f"## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成," \ + f"仅供参考,请以实际更改为准。\n\n" issue_body += f"## 🔗 相关PR链接\n\n" issue_body += f"- {pr_html_url}\n" return issue_body -def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: str, pr_number: int, - siliconflow_api_key: str, siliconflow_api_base: str, pr_html_url: str, issue_title_pr_mark: str, +def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: str, + pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str, + pr_html_url: str, issue_title_pr_mark: str, translation_agent_config: TranslationAgentConfig = None): """ 处理单个组织配置项 @@ -388,7 +399,8 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st # 验证是否需要创建issue if file_count == 0: logger.warning( - "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) + "NOTE: https://gitee.com/{}/files change files out of translate range" + .format(issue_title_pr_mark)) return if changed_same_files: @@ -406,7 +418,8 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_titles) if not need_create_issue_list: - feedback_comment = "issue has already created, please go to check issue: {}".format(existed_issue_list) + feedback_comment = "issue has already created, please go to check issue: {}".format( + existed_issue_list) logger.info("Warning: " + feedback_comment) cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) return @@ -428,21 +441,30 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st max_retry = model_config.get('max_retry', 5) max_retry_ollama = model_config.get('max_retry_ollama', 1) - issue_summary = get_agent_summary( - diff_content, siliconflow_api_key, siliconflow_api_base, - model_name=model_name, backend_type=backend_type, temperature=temperature, - max_workers=max_workers, single_file_timeout=single_file_timeout, - total_summary_timeout=total_summary_timeout, max_retry=max_retry, - max_retry_ollama=max_retry_ollama - ) - 
issue_body = generate_issue_body(issue_summary, diff_files, pr_html_url) + try: + issue_summary = get_agent_summary( + diff_content, siliconflow_api_key, siliconflow_api_base, + model_name=model_name, backend_type=backend_type, temperature=temperature, + max_workers=max_workers, single_file_timeout=single_file_timeout, + total_summary_timeout=total_summary_timeout, max_retry=max_retry, + max_retry_ollama=max_retry_ollama + ) + issue_body = generate_issue_body(issue_summary, diff_files, pr_html_url) + except Exception as e: + logger.error(f"AI Agent调用失败: {e}") + logger.info("回退到传统方式创建issue") + # 使用传统方式的简单issue body格式 + issue_body = "### Related PR link \n - {}".format(pr_html_url) - cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, + cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, + need_create_issue_item, need_create_issue_template[need_create_issue_item], issue_body) -def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, - pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str): +def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, + pr_owner: str, pr_repo: str, + pr_number: int, siliconflow_api_key: str, + siliconflow_api_base: str): """ 基于PR diff和配置创建issue的主函数 """ @@ -454,8 +476,8 @@ def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_ continue process_org_item(org_item, cli, pr_owner, pr_repo, pr_number, - siliconflow_api_key, siliconflow_api_base, pr_html_url, issue_title_pr_mark, - conf.translation_agent) + siliconflow_api_key, siliconflow_api_base, pr_html_url, + issue_title_pr_mark, conf.translation_agent) @@ -466,7 +488,9 @@ def main(): parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') parser.add_argument('--pr_number', type=str, required=True, help='the PR number') parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') - parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') + parser.add_argument('--siliconflow_api_base', type=str, + default="https://api.siliconflow.cn/v1", + help='the base URL of siliconflow') args = Args() parser.parse_args(args=sys.argv[1:], namespace=args) args.validate() @@ -482,7 +506,8 @@ def main(): pr_number = args.pr_number siliconflow_api_key = args.siliconflow_api_key siliconflow_api_base = args.siliconflow_api_base - create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) + create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, + siliconflow_api_key, siliconflow_api_base) if __name__ == '__main__': diff --git a/ci/tools/translation/new_create_translation_issue_AI.yaml b/ci/tools/translation/new_create_translation_issue_AI.yaml index bc48ab7a2..4455c5750 100755 --- a/ci/tools/translation/new_create_translation_issue_AI.yaml +++ b/ci/tools/translation/new_create_translation_issue_AI.yaml @@ -9,7 +9,7 @@ translation_agent: # Model Configuration model: - name: "Qwen/Qwen3-32B" # Options: "llama3" "Qwen/Qwen3-8B" "THUDM/GLM-4-32B-0414" or others + name: "Qwen/Qwen3-32B" # Options: "llama3" "Qwen/Qwen3-8B" "THUDM/GLM-4-32B-0414" or others temperature: 0.1 max_retry: 5 # For siliconflow backend max_retry_ollama: 1 # For ollama backend @@ -35,7 +35,7 @@ orgs: issue_title: "[Auto] This is an English translation issue for the PR" 
issue_assignee: judithsq file_extension: [ doc, md, json ] - change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] + change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '"', '"', '、' ] - org_name: src-openeuler issue_of_owner: openeuler @@ -46,4 +46,4 @@ orgs: issue_title: "[Auto] This is an English translation issue for the PR" issue_assignee: judithsq file_extension: [ doc, md, json ] - change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] \ No newline at end of file + change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '"', '"', '、' ] \ No newline at end of file diff --git a/ci/tools/translation/translation_agent.py b/ci/tools/translation/translation_agent.py index 60e4aac60..dbe89469d 100755 --- a/ci/tools/translation/translation_agent.py +++ b/ci/tools/translation/translation_agent.py @@ -47,7 +47,8 @@ logger = logging.getLogger(__name__) class SingleFileSummary(BaseModel): """单个文件摘要的结构化输出""" file_path: str = Field(description="文件路径", default="") - change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") + change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", + "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") potential_impact: str = Field(description="改动对其他文件潜在的影响") summary: str = Field(description="改动的详细摘要") lines_added: int = Field(description="新增行数", default=0) @@ -56,7 +57,8 @@ class SingleFileSummary(BaseModel): class FileChangeInfo(BaseModel): """文件改动信息""" file_path: str = Field(description="文件路径") - change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") + change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", + "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") lines_changed: int = Field(description="改动行数") class TotalSummary(BaseModel): @@ -418,7 +420,8 @@ class LLMFactory: @staticmethod def create_chat_llm(model_name: str = None, base_url: str = None, backend_type: str = None, - temperature: float = None, siliconflow_api_key: str = "", siliconflow_api_base: str = ""): + temperature: float = None, siliconflow_api_key: str = "", + siliconflow_api_base: str = ""): """创建LLM实例""" if model_name is None: model_name = DEFAULT_MODEL_NAME @@ -447,7 +450,8 @@ class LLMFactory: @staticmethod def create_llm(model_name: str = None, base_url: str = None, backend_type: str = None, - temperature: float = None, siliconflow_api_key: str = "", siliconflow_api_base: str = ""): + temperature: float = None, siliconflow_api_key: str = "", + siliconflow_api_base: str = ""): """创建LLM实例""" if model_name is None: model_name = DEFAULT_MODEL_NAME @@ -664,7 +668,8 @@ Git Diff 内容: class SingleFileAnalysisChain: """单文件分析任务链""" - def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, backend_type: str = DEFAULT_BACKEND_TYPE): + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, + backend_type: str = DEFAULT_BACKEND_TYPE): self.llm = llm self.token_counter = token_counter self.backend_type = backend_type @@ -805,7 +810,8 @@ Git Diff 内容: ]) self.chain = self.prompt | self.llm | self.output_parser - def analyze(self, diff_file_info: DiffFileInfo, max_retry_ollama: int = DEFAULT_MODEL_MAX_RETRY_OLLAMA, + def analyze(self, diff_file_info: DiffFileInfo, + max_retry_ollama: int = DEFAULT_MODEL_MAX_RETRY_OLLAMA, max_retry: int = DEFAULT_MODEL_MAX_RETRY) -> Optional[SingleFileSummary]: """分析单个文件的改动""" max_retry_count = max_retry_ollama if self.backend_type 
== BACKEND_TYPE_OLLAMA else max_retry @@ -877,10 +883,12 @@ Git Diff 内容: if code in err_str: is_http_error = True break - if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and any(code in err_str for code in ["404", "500", "502", "503", "504"]): + if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and \ + any(code in err_str for code in ["404", "500", "502", "503", "504"]): is_http_error = True if is_http_error: - logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e},第{attempt}次尝试,10秒后重试...") + logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e}," + f"第{attempt}次尝试,10秒后重试...") if attempt < max_retry_count: time.sleep(10) continue @@ -889,13 +897,15 @@ Git Diff 内容: # 其它异常直接进入下一次重试 if attempt < max_retry_count: logger.info(f"第{attempt}次尝试失败,准备重试...") - logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry_count}次均未获得结构化输出,放弃。") + logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry_count}次均未获得结构化输出," + f"放弃。") return None class TotalSummaryChain: """总摘要生成任务链""" - def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, backend_type: str = DEFAULT_BACKEND_TYPE): + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, + backend_type: str = DEFAULT_BACKEND_TYPE): self.llm = llm self.token_counter = token_counter self.backend_type = backend_type @@ -983,7 +993,8 @@ class TotalSummaryChain: ]) self.chain = self.prompt | self.llm | self.output_parser - def generate(self, file_summaries: List[SingleFileSummary], total_summary_timeout: int = DEFAULT_TOTAL_SUMMARY_TIMEOUT) -> Optional[TotalSummary]: + def generate(self, file_summaries: List[SingleFileSummary], + total_summary_timeout: int = DEFAULT_TOTAL_SUMMARY_TIMEOUT) -> Optional[TotalSummary]: """生成总摘要""" try: total_files = len(file_summaries) @@ -1034,7 +1045,8 @@ class TotalSummaryChain: try: result = future.result(timeout=total_summary_timeout) except (FutureTimeoutError, TimeoutError) as e: - logger.error(f"生成总摘要超时({total_summary_timeout}秒),放弃生成总摘要: {type(e).__name__}") + logger.error(f"生成总摘要超时({total_summary_timeout}秒),放弃生成总摘要: " + f"{type(e).__name__}") try: future.cancel() # 尝试取消超时的任务 except Exception as cancel_e: @@ -1089,10 +1101,12 @@ class TotalSummaryChain: class GitDiffSummarizer: """Git Diff 摘要生成器""" - def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", + def __init__(self, siliconflow_api_key: str = "", + siliconflow_api_base: str = "https://api.siliconflow.cn/v1", model_name: str = None, base_url: str = None, backend_type: str = None, - temperature: float = None, max_workers: int = None, single_file_timeout: int = None, - total_summary_timeout: int = None, max_retry: int = None, max_retry_ollama: int = None): + temperature: float = None, max_workers: int = None, + single_file_timeout: int = None, total_summary_timeout: int = None, + max_retry: int = None, max_retry_ollama: int = None): if model_name is None: model_name = DEFAULT_MODEL_NAME if base_url is None: @@ -1120,7 +1134,8 @@ class GitDiffSummarizer: self.max_retry_ollama = max_retry_ollama self.token_counter = TokenCounter(model_name) - self.llm = LLMFactory.create_chat_llm(model_name, base_url, backend_type, temperature, siliconflow_api_key, siliconflow_api_base) + self.llm = LLMFactory.create_chat_llm(model_name, base_url, backend_type, temperature, + siliconflow_api_key, siliconflow_api_base) self.single_file_chain = SingleFileAnalysisChain(self.llm, self.token_counter, backend_type) 
self.total_summary_chain = TotalSummaryChain(self.llm, self.token_counter, backend_type) @@ -1187,11 +1202,14 @@ class GitDiffSummarizer: summary = future.result(timeout=5) # 短暂缓冲时间,因为任务已经完成 if summary: file_summaries.append(summary) - logger.info(f"完成文件 {file_path} 的摘要生成 ({completed_count}/{total_count})") + logger.info(f"完成文件 {file_path} 的摘要生成 " + f"({completed_count}/{total_count})") else: - logger.warning(f"文件 {file_path} 的摘要生成失败 ({completed_count}/{total_count})") + logger.warning(f"文件 {file_path} 的摘要生成失败 " + f"({completed_count}/{total_count})") except (FutureTimeoutError, TimeoutError) as e: - logger.error(f"文件 {file_path} 的摘要获取超时,跳过该文件: {type(e).__name__} ({completed_count}/{total_count})") + logger.error(f"文件 {file_path} 的摘要获取超时,跳过该文件: " + f"{type(e).__name__} ({completed_count}/{total_count})") try: future.cancel() except Exception as cancel_e: @@ -1199,7 +1217,8 @@ class GitDiffSummarizer: except Exception as e: logger.error(f"处理文件 {file_path} 时发生异常: {e} ({completed_count}/{total_count})") except (FutureTimeoutError, TimeoutError) as overall_e: - logger.error(f"整体处理超时({overall_timeout}秒),已完成{completed_count}/{total_count}个文件") + logger.error(f"整体处理超时({overall_timeout}秒)," + f"已完成{completed_count}/{total_count}个文件") # 取消所有未完成的任务 for future in future_to_file: if not future.done(): @@ -1238,7 +1257,8 @@ class GitDiffSummarizer: # ==================== 主函数 ==================== -def get_agent_summary(sample_diff, siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1", +def get_agent_summary(sample_diff, siliconflow_api_key="", + siliconflow_api_base="https://api.siliconflow.cn/v1", model_name=None, base_url=None, backend_type=None, temperature=None, max_workers=None, single_file_timeout=None, total_summary_timeout=None, max_retry=None, max_retry_ollama=None): -- Gitee From 08dcc65f6b27cbbe9b2531bcb44891b65b6e12ab Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Fri, 26 Sep 2025 17:58:38 +0800 Subject: [PATCH 7/8] update for another review --- .../new_create_translation_issue_AI.py | 146 +++++++++++++++++- 1 file changed, 138 insertions(+), 8 deletions(-) diff --git a/ci/tools/translation/new_create_translation_issue_AI.py b/ci/tools/translation/new_create_translation_issue_AI.py index 4e0a52e3b..5d2fac0c6 100755 --- a/ci/tools/translation/new_create_translation_issue_AI.py +++ b/ci/tools/translation/new_create_translation_issue_AI.py @@ -173,7 +173,7 @@ class GiteeClient: } req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) result: dict | None = send_request(req_args, {}) - return result is None + return result is not None def add_pr_comment(self, owner, repo, number, body): req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) @@ -375,6 +375,78 @@ def generate_issue_body(issue_summary, diff_files: list[str], pr_html_url: str) return issue_body +def generate_issue_body_without_ai_summary(diff_files: list[str], pr_html_url: str) -> str: + """ + 生成不包含AI摘要的issue正文内容 + """ + issue_body = f"## ⚠️ 翻译变更检测\n\n" + issue_body += f"检测到需要翻译的文件变更,但本次变更不包含docs/zh路径下的文件,因此未生成AI摘要。\n\n" + issue_body += f"**变更文件数量**: {len(diff_files)}\n" + issue_body += f"**相关PR**: {pr_html_url}\n\n" + issue_body += f"## 📝 变更文件列表\n\n" + + # 只显示docs/zh路径下的文件 + docs_zh_files = [f for f in diff_files if f.startswith('docs/zh/')] + if docs_zh_files: + for file_path in docs_zh_files: + issue_body += f"- {file_path}\n" + else: + issue_body += f"本次变更未包含docs/zh路径下的文件。\n" + + issue_body += f"\n## 🔗 
相关PR链接\n\n" + issue_body += f"- {pr_html_url}\n" + + return issue_body + + +def filter_docs_zh_files(diff_content: str) -> str: + """ + 过滤diff内容,只保留docs/zh路径下的文件变更 + """ + if not diff_content: + return "" + + lines = diff_content.split('\n') + filtered_lines = [] + current_file_section = [] + in_docs_zh_file = False + + for line in lines: + if line.startswith('diff --git'): + # 处理前一个文件 + if in_docs_zh_file and current_file_section: + filtered_lines.extend(current_file_section) + + # 检查新文件是否在docs/zh路径下 + current_file_section = [line] + in_docs_zh_file = False + + # 提取文件路径 + if ' a/' in line and ' b/' in line: + # 找到 a/ 和 b/ 的位置 + a_pos = line.find(' a/') + b_pos = line.find(' b/') + + if a_pos != -1 and b_pos != -1 and a_pos < b_pos: + # 提取a/和b/之间的路径 + a_start = a_pos + 3 # 跳过 ' a/' + file_path = line[a_start:b_pos] + + # 检查是否在docs/zh路径下 + if file_path.startswith('docs/zh/'): + in_docs_zh_file = True + logger.info(f"包含docs/zh路径下的文件: {file_path}") + else: + # 继续当前文件的内容 + current_file_section.append(line) + + # 处理最后一个文件 + if in_docs_zh_file and current_file_section: + filtered_lines.extend(current_file_section) + + return '\n'.join(filtered_lines) + + def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: str, pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str, pr_html_url: str, issue_title_pr_mark: str, @@ -387,6 +459,55 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st if diff_content is None: sys.exit(1) + # 过滤只保留docs/zh路径下的文件 + filtered_diff_content = filter_docs_zh_files(diff_content) + + # 检查是否有docs/zh路径下的文件变更 + if not filtered_diff_content.strip(): + logger.info("没有docs/zh路径下的文件变更,跳过AI摘要生成") + # 创建简单的issue,不包含AI摘要 + diff_files = get_diff_file_list(diff_content) + file_count, zh_file, en_file, need_create_issue = analyze_diff_files( + diff_files, org_item.issue_triggers, issue_title_pr_mark) + + if file_count == 0: + logger.warning( + "NOTE: https://gitee.com/{}/files change files out of translate range" + .format(issue_title_pr_mark)) + return + + if check_same_files_changed(zh_file, en_file): + logger.info("changed the same files in en and zh path, no need to create issue") + return + + need_create_issue_template, need_create_issue_titles = prepare_issue_templates(need_create_issue) + if not need_create_issue_titles: + return + + need_create_issue_list, existed_issue_list = cli.check_issue_exists( + org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_titles) + + if not need_create_issue_list: + feedback_comment = "所有相关的翻译issue已经存在,请检查: {}".format( + ", ".join(existed_issue_list)) + logger.info("Warning: " + feedback_comment) + cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) + return + + # 创建不包含AI摘要的简单issue + for need_create_issue_item in need_create_issue_list: + issue_body = generate_issue_body_without_ai_summary(diff_files, pr_html_url) + success = cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, + need_create_issue_item, + need_create_issue_template[need_create_issue_item], issue_body) + if success: + logger.info(f"成功创建issue: {need_create_issue_item}") + else: + logger.error(f"创建issue失败: {need_create_issue_item}") + error_comment = f"创建翻译issue失败: {need_create_issue_item},请手动创建" + cli.add_pr_comment(pr_owner, pr_repo, pr_number, error_comment) + return + diff_files = get_diff_file_list(diff_content) # 分析diff文件 @@ -418,8 +539,8 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st org_item.issue_of_owner, org_item.issue_of_repo, 
need_create_issue_titles) if not need_create_issue_list: - feedback_comment = "issue has already created, please go to check issue: {}".format( - existed_issue_list) + feedback_comment = "所有相关的翻译issue已经存在,请检查: {}".format( + ", ".join(existed_issue_list)) logger.info("Warning: " + feedback_comment) cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) return @@ -442,23 +563,32 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st max_retry_ollama = model_config.get('max_retry_ollama', 1) try: + # 使用过滤后的diff内容生成AI摘要 issue_summary = get_agent_summary( - diff_content, siliconflow_api_key, siliconflow_api_base, + filtered_diff_content, siliconflow_api_key, siliconflow_api_base, model_name=model_name, backend_type=backend_type, temperature=temperature, max_workers=max_workers, single_file_timeout=single_file_timeout, total_summary_timeout=total_summary_timeout, max_retry=max_retry, max_retry_ollama=max_retry_ollama ) issue_body = generate_issue_body(issue_summary, diff_files, pr_html_url) + logger.info("AI Agent成功生成issue内容") except Exception as e: logger.error(f"AI Agent调用失败: {e}") logger.info("回退到传统方式创建issue") # 使用传统方式的简单issue body格式 issue_body = "### Related PR link \n - {}".format(pr_html_url) - cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, - need_create_issue_item, - need_create_issue_template[need_create_issue_item], issue_body) + success = cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, + need_create_issue_item, + need_create_issue_template[need_create_issue_item], issue_body) + if success: + logger.info(f"成功创建issue: {need_create_issue_item}") + else: + logger.error(f"创建issue失败: {need_create_issue_item}") + # 添加PR评论说明创建失败 + error_comment = f"创建翻译issue失败: {need_create_issue_item},请手动创建" + cli.add_pr_comment(pr_owner, pr_repo, pr_number, error_comment) def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, @@ -503,7 +633,7 @@ def main(): pr_owner = args.pr_owner pr_repo = args.pr_repo - pr_number = args.pr_number + pr_number = int(args.pr_number) siliconflow_api_key = args.siliconflow_api_key siliconflow_api_base = args.siliconflow_api_base create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, -- Gitee From e56305221123c32f11fe022875fd6771e2a19da7 Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Mon, 29 Sep 2025 10:44:05 +0800 Subject: [PATCH 8/8] update for review --- .../new_create_translation_issue_AI.py | 239 ++++++++---------- 1 file changed, 111 insertions(+), 128 deletions(-) diff --git a/ci/tools/translation/new_create_translation_issue_AI.py b/ci/tools/translation/new_create_translation_issue_AI.py index 5d2fac0c6..6ae647731 100755 --- a/ci/tools/translation/new_create_translation_issue_AI.py +++ b/ci/tools/translation/new_create_translation_issue_AI.py @@ -81,9 +81,6 @@ class ReqArgs: T = TypeVar('T') -content_type_is_text = "text/plain" -content_type_is_json_dict = {} -content_type_is_json_list = [] def send_request(args: ReqArgs, t: Generic[T]) -> T: @@ -186,11 +183,21 @@ class GiteeClient: def check_only_marks_changed(self, owner, repo, number, check_list): diff_content = self.get_diff_content(owner, repo, number) - deleted_strs, inserted_strs = get_diff_content_list(diff_content) + + # 检查docs/en路径下是否有对应的文件变更 + zh_files_in_en = check_zh_files_also_modified_in_en(diff_content) + + # 只检查docs/zh路径下的变更,过滤掉同时在en下修改的文件 + filtered_diff_content = filter_docs_zh_files(diff_content, zh_files_in_en) + if not filtered_diff_content.strip(): + 
logger.info('No docs/zh changes found, skip mark change check') + return + + deleted_strs, inserted_strs = get_diff_content_list(filtered_diff_content) if is_only_marks_changed(deleted_strs, inserted_strs, check_list): - logger.warning('Only marks changed, skip the following steps') + logger.warning('Only marks changed in docs/zh files, skip the following steps') sys.exit(1) - logger.info('Not just only marks changed, continue creating issue') + logger.info('Not just only marks changed in docs/zh files, continue creating issue') def get_diff_file_list(diff_content: str) -> list[str]: @@ -271,19 +278,22 @@ def load_config_yaml(yaml_path): return Config(**data) -def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger], - issue_title_pr_mark: str) -> tuple[int, list[str], list[str], dict]: +def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger], + issue_title_pr_mark: str) -> tuple[int, list[str], dict]: """ - 分析diff文件,识别需要创建issue的文件 - 返回: (文件计数, 中文文件列表, 英文文件列表, 需要创建的issue字典) + 分析diff文件,识别需要创建issue的文件(只处理docs/zh路径下的文件,不包括同时在docs/en下修改的文件) + 返回: (文件计数, 中文文件列表, 需要创建的issue字典) """ file_count = 0 zh_file = [] - en_file = [] need_create_issue = {} - + for trigger in issue_triggers: for diff_file in diff_files: + # 只处理docs/zh路径下的文件 + if not diff_file.startswith('docs/zh/'): + continue + if diff_file.startswith(trigger.trigger_pr_path) and \ diff_file.split('.')[-1] in trigger.file_extension: logger.info("file {} has been changed".format(diff_file)) @@ -293,27 +303,36 @@ def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger] trigger.issue_assignee, "{}({}).".format(trigger.issue_title, issue_title_pr_mark) ] - zh_file.append(diff_file.replace("zh/", "")) - elif "/en" in trigger.trigger_pr_path: - need_create_issue["en"] = [ - trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark) - ] - en_file.append(diff_file.replace("en/", "")) - else: - logger.warning("not a range") - - return file_count, zh_file, en_file, need_create_issue + # 提取相对于docs/zh/的路径 + relative_path = diff_file.replace("docs/zh/", "") + zh_file.append(relative_path) + return file_count, zh_file, need_create_issue -def check_same_files_changed(zh_file: list[str], en_file: list[str]) -> bool: + +def check_zh_files_also_modified_in_en(diff_content: str) -> list[str]: """ - 检查中英文路径下是否修改了相同的文件 + 检查哪些docs/zh文件在docs/en下也有修改 + 返回:同时在docs/zh和docs/en下修改的文件列表(相对于docs/zh/的路径) """ - for z in zh_file: - if z in en_file: - return True - return False + if not diff_content: + return [] + + # 获取所有diff文件 + all_diff_files = get_diff_file_list(diff_content) + + # 获取docs/zh和docs/en下的文件 + zh_files = [f.replace("docs/zh/", "") for f in all_diff_files if f.startswith("docs/zh/")] + en_files = [f.replace("docs/en/", "") for f in all_diff_files if f.startswith("docs/en/")] + + # 找出同时在zh和en下修改的文件 + zh_files_in_en = [] + for zh_file in zh_files: + if zh_file in en_files: + zh_files_in_en.append(zh_file) + logger.info(f"文件 {zh_file} 在docs/zh和docs/en下都有修改,将跳过摘要生成") + + return zh_files_in_en def prepare_issue_templates(need_create_issue: dict) -> tuple[dict, list[str]]: @@ -366,84 +385,81 @@ def generate_issue_body(issue_summary, diff_files: list[str], pr_html_url: str) issue_body += f"检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n" issue_body += f"**变更文件数量**: {len(diff_files)}\n" issue_body += f"**相关PR**: {pr_html_url}\n\n" - + issue_body += f"## 📝 变更文件列表\n\n" + for file_path in diff_files: + issue_body += f"- {file_path}\n" + issue_body += f"\n" + issue_body += f"## ❗️ 
本Issue的摘要内容基于AI Agent技术自动生成," \ - f"仅供参考,请以实际更改为准。\n\n" + f"仅供参考,请以实际更改为准。\n\n" issue_body += f"## 🔗 相关PR链接\n\n" issue_body += f"- {pr_html_url}\n" return issue_body -def generate_issue_body_without_ai_summary(diff_files: list[str], pr_html_url: str) -> str: - """ - 生成不包含AI摘要的issue正文内容 - """ - issue_body = f"## ⚠️ 翻译变更检测\n\n" - issue_body += f"检测到需要翻译的文件变更,但本次变更不包含docs/zh路径下的文件,因此未生成AI摘要。\n\n" - issue_body += f"**变更文件数量**: {len(diff_files)}\n" - issue_body += f"**相关PR**: {pr_html_url}\n\n" - issue_body += f"## 📝 变更文件列表\n\n" - - # 只显示docs/zh路径下的文件 - docs_zh_files = [f for f in diff_files if f.startswith('docs/zh/')] - if docs_zh_files: - for file_path in docs_zh_files: - issue_body += f"- {file_path}\n" - else: - issue_body += f"本次变更未包含docs/zh路径下的文件。\n" - - issue_body += f"\n## 🔗 相关PR链接\n\n" - issue_body += f"- {pr_html_url}\n" - - return issue_body - - -def filter_docs_zh_files(diff_content: str) -> str: +def filter_docs_zh_files(diff_content: str, exclude_files: list[str] = None) -> str: """ 过滤diff内容,只保留docs/zh路径下的文件变更 + :param exclude_files: 需要排除的文件列表(相对于docs/zh/的路径) """ + if exclude_files is None: + exclude_files = [] + if not diff_content: return "" - + lines = diff_content.split('\n') filtered_lines = [] current_file_section = [] in_docs_zh_file = False - + current_file_path = "" + for line in lines: if line.startswith('diff --git'): # 处理前一个文件 if in_docs_zh_file and current_file_section: - filtered_lines.extend(current_file_section) - + # 检查当前文件是否需要排除 + relative_path = current_file_path.replace("docs/zh/", "") + if relative_path not in exclude_files: + filtered_lines.extend(current_file_section) + logger.info(f"包含docs/zh路径下的文件: {current_file_path}") + else: + logger.info(f"排除docs/zh路径下的文件(因为在en下也有修改): {current_file_path}") + # 检查新文件是否在docs/zh路径下 current_file_section = [line] in_docs_zh_file = False - + current_file_path = "" + # 提取文件路径 if ' a/' in line and ' b/' in line: # 找到 a/ 和 b/ 的位置 a_pos = line.find(' a/') b_pos = line.find(' b/') - + if a_pos != -1 and b_pos != -1 and a_pos < b_pos: # 提取a/和b/之间的路径 a_start = a_pos + 3 # 跳过 ' a/' - file_path = line[a_start:b_pos] - + current_file_path = line[a_start:b_pos] + # 检查是否在docs/zh路径下 - if file_path.startswith('docs/zh/'): + if current_file_path.startswith('docs/zh/'): in_docs_zh_file = True - logger.info(f"包含docs/zh路径下的文件: {file_path}") else: # 继续当前文件的内容 current_file_section.append(line) - + # 处理最后一个文件 if in_docs_zh_file and current_file_section: - filtered_lines.extend(current_file_section) - + # 检查当前文件是否需要排除 + relative_path = current_file_path.replace("docs/zh/", "") + if relative_path not in exclude_files: + filtered_lines.extend(current_file_section) + logger.info(f"包含docs/zh路径下的文件: {current_file_path}") + else: + logger.info(f"排除docs/zh路径下的文件(因为在en下也有修改): {current_file_path}") + return '\n'.join(filtered_lines) @@ -458,65 +474,36 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) if diff_content is None: sys.exit(1) - - # 过滤只保留docs/zh路径下的文件 - filtered_diff_content = filter_docs_zh_files(diff_content) - - # 检查是否有docs/zh路径下的文件变更 + + # 早期检查:查看diff中是否包含docs/zh路径下的文件变更 + if 'docs/zh/' not in diff_content: + logger.info("diff内容中不包含docs/zh路径下的文件变更,无需创建翻译issue") + return + + # 检查docs/en路径下是否有对应的文件变更 + zh_files_in_en = check_zh_files_also_modified_in_en(diff_content) + if zh_files_in_en: + logger.info(f"发现 {len(zh_files_in_en)} 个在docs/zh和docs/en下同时修改的文件:{zh_files_in_en}") + else: + logger.info("没有发现同时在docs/zh和docs/en下修改的文件") + + # 
过滤只保留docs/zh路径下的文件,排除同时在docs/en下修改的文件 + filtered_diff_content = filter_docs_zh_files(diff_content, zh_files_in_en) + + # 检查是否有需要处理的docs/zh路径下的文件变更 if not filtered_diff_content.strip(): - logger.info("没有docs/zh路径下的文件变更,跳过AI摘要生成") - # 创建简单的issue,不包含AI摘要 - diff_files = get_diff_file_list(diff_content) - file_count, zh_file, en_file, need_create_issue = analyze_diff_files( - diff_files, org_item.issue_triggers, issue_title_pr_mark) - - if file_count == 0: - logger.warning( - "NOTE: https://gitee.com/{}/files change files out of translate range" - .format(issue_title_pr_mark)) - return - - if check_same_files_changed(zh_file, en_file): - logger.info("changed the same files in en and zh path, no need to create issue") - return - - need_create_issue_template, need_create_issue_titles = prepare_issue_templates(need_create_issue) - if not need_create_issue_titles: - return - - need_create_issue_list, existed_issue_list = cli.check_issue_exists( - org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_titles) - - if not need_create_issue_list: - feedback_comment = "所有相关的翻译issue已经存在,请检查: {}".format( - ", ".join(existed_issue_list)) - logger.info("Warning: " + feedback_comment) - cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) - return - - # 创建不包含AI摘要的简单issue - for need_create_issue_item in need_create_issue_list: - issue_body = generate_issue_body_without_ai_summary(diff_files, pr_html_url) - success = cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, - need_create_issue_item, - need_create_issue_template[need_create_issue_item], issue_body) - if success: - logger.info(f"成功创建issue: {need_create_issue_item}") - else: - logger.error(f"创建issue失败: {need_create_issue_item}") - error_comment = f"创建翻译issue失败: {need_create_issue_item},请手动创建" - cli.add_pr_comment(pr_owner, pr_repo, pr_number, error_comment) + logger.info("没有需要处理的docs/zh路径下的文件变更,无需创建翻译issue") return - - diff_files = get_diff_file_list(diff_content) - + + diff_files = get_diff_file_list(filtered_diff_content) + logger.info(f"解析出 {len(diff_files)} 个变更文件:{diff_files}") + # 分析diff文件 - file_count, zh_file, en_file, need_create_issue = analyze_diff_files( + file_count, zh_file, need_create_issue = analyze_diff_files( diff_files, org_item.issue_triggers, issue_title_pr_mark) - - # 检查是否修改了相同文件 - changed_same_files = check_same_files_changed(zh_file, en_file) - + + logger.info(f"分析完成:共找到 {file_count} 个需要处理的文件") + # 验证是否需要创建issue if file_count == 0: logger.warning( @@ -524,10 +511,6 @@ def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: st .format(issue_title_pr_mark)) return - if changed_same_files: - logger.info("changed the same files in en and zh path, no need to create issue") - return - # 准备issue模板 need_create_issue_template, need_create_issue_titles = prepare_issue_templates(need_create_issue) -- Gitee
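The heart of patches 7 and 8 is pre-filtering the unified diff so that only docs/zh/ sections ever reach the summarizer. Below is a minimal standalone sketch of that step; it assumes the same `diff --git a/<path> b/<path>` header convention the patches parse, and the function and parameter names are illustrative, not part of the series:

# ==================== Sketch: diff filtering (illustrative, not part of the series) ====================
import re

DIFF_HEADER = re.compile(r'^diff --git a/(?P<path>.+?) b/')

def keep_docs_zh_sections(diff_content: str, exclude: list[str] | None = None) -> str:
    """Return only the docs/zh/ file sections of a unified diff.

    exclude holds paths relative to docs/zh/ that should be dropped,
    mirroring the zh_files_in_en list built in patch 8.
    """
    exclude = exclude or []
    kept, current, keep = [], [], False
    for line in (diff_content or "").splitlines():
        match = DIFF_HEADER.match(line)
        if match:
            if keep:
                kept.extend(current)          # flush the previous kept section
            path = match.group('path')
            keep = (path.startswith('docs/zh/')
                    and path[len('docs/zh/'):] not in exclude)
            current = [line]
        else:
            current.append(line)
    if keep:
        kept.extend(current)                  # flush the final section
    return '\n'.join(kept)

For example, keep_docs_zh_sections(diff, exclude=['README.md']) drops docs/zh/README.md even though it sits under docs/zh/. Compared with the index arithmetic in filter_docs_zh_files, a compiled regex keeps the header parsing in one place; paths that themselves contain " b/" are equally undefined behaviour in both versions.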
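Patch 8's check_only_marks_changed short-circuits when a change touches nothing but punctuation. is_only_marks_changed itself is defined elsewhere in new_create_translation_issue_AI.py and never shown in this series, so the following is only a plausible reading of its contract, assuming it compares the deleted and inserted fragments after stripping the marks configured in change_content_exclude:

# ==================== Sketch: punctuation-only check (assumed contract) ====================
def is_only_marks_changed_sketch(deleted_strs: list[str],
                                 inserted_strs: list[str],
                                 check_list: list[str]) -> bool:
    """True if removing every configured mark leaves old and new text identical."""
    def strip_marks(parts: list[str]) -> str:
        text = ''.join(parts)
        for mark in check_list:
            text = text.replace(mark, '')
        return text

    # e.g. is_only_marks_changed_sketch(['你好,'], ['你好,'], [',', ',']) -> True
    return strip_marks(deleted_strs) == strip_marks(inserted_strs)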
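Patch 1's process_git_diff, kept through the later revisions, fans the per-file analyses out over a ThreadPoolExecutor while tolerating individual failures and an overall deadline (the series computes its deadline as single_file_timeout * len(files) + 600). Reduced to the bare pattern, with illustrative names and Python 3.9+'s cancel_futures, it looks roughly like this:

# ==================== Sketch: bounded fan-out with an overall deadline (illustrative) ====================
import concurrent.futures as cf

def fan_out(items, worker, max_workers=8, overall_timeout=600):
    """Run worker over items in parallel; keep whatever finishes in time."""
    pool = cf.ThreadPoolExecutor(max_workers=max_workers)
    futures = {pool.submit(worker, item): item for item in items}
    results = []
    try:
        for future in cf.as_completed(futures, timeout=overall_timeout):
            try:
                results.append((futures[future], future.result()))
            except Exception:
                results.append((futures[future], None))  # one failure must not sink the batch
    except cf.TimeoutError:
        pass                                             # stragglers are abandoned, as in the patch
    finally:
        pool.shutdown(wait=False, cancel_futures=True)   # return immediately; never block on hung calls
    return results

The same future-with-timeout idea reappears in TotalSummaryChain.generate, where a single chain.invoke is submitted to an executor purely so it can be given a total_summary_timeout.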