From 98bea8f401b83bf976d0fbb0c545f91141392f68 Mon Sep 17 00:00:00 2001
From: petermouse666 <708975811@qq.com>
Date: Sat, 20 Sep 2025 20:42:21 +0800
Subject: [PATCH 1/2] update ci-bot for auto generating translation comment

---
 ci/tools/comment/comment_agent.py    | 958 +++++++++++++++++++++++++++
 ci/tools/comment/create_comment.py   | 372 +++++++++++
 ci/tools/comment/create_comment.yaml |  38 ++
 3 files changed, 1368 insertions(+)
 create mode 100644 ci/tools/comment/comment_agent.py
 create mode 100644 ci/tools/comment/create_comment.py
 create mode 100644 ci/tools/comment/create_comment.yaml

diff --git a/ci/tools/comment/comment_agent.py b/ci/tools/comment/comment_agent.py
new file mode 100644
index 00000000..25dbe385
--- /dev/null
+++ b/ci/tools/comment/comment_agent.py
@@ -0,0 +1,958 @@
+import json
+import re
+import logging
+import urllib.parse
+from typing import List, Dict, Any, Optional, Tuple, Literal
+from dataclasses import dataclass
+from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError
+from pathlib import Path
+import sys
+import time
+# LangChain imports
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
+from langchain_core.runnables import RunnableLambda, RunnablePassthrough
+from pydantic import BaseModel, Field, SecretStr
+from langchain_community.llms import Ollama
+from langchain_ollama import ChatOllama
+from langchain.chains import TransformChain, SequentialChain
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_openai import ChatOpenAI
+import yaml
+
+# ==================== 配置加载 ====================
+
+def load_config(config_file="create_comment.yaml"):
+    """从YAML文件加载配置"""
+    try:
+        with open(config_file, 'r', encoding='utf-8') as f:
+            config = yaml.safe_load(f)
+        return config.get('comment_agent', {})
+    except FileNotFoundError:
+        print(f"配置文件 {config_file} 不存在")
+        raise
+    except yaml.YAMLError as e:
+        print(f"解析配置文件时发生错误: {e}")
+        raise
+
+# 加载配置
+_config = load_config()
+
+# ==================== 配置常量 ====================
+
+BACKEND_TYPE = _config.get('backend', {}).get('type', 'siliconflow')
+MODEL_NAME = _config.get('model', {}).get('name', 'Qwen/Qwen3-8B')
+MODEL_TEMPERATURE = _config.get('model', {}).get('temperature', 0.1)
+MODEL_MAX_RETRY = _config.get('model', {}).get('max_retry', 5)
+PROCESSING_MAX_WORKERS = _config.get('processing', {}).get('max_workers', 8)
+SINGLE_FILE_TIMEOUT = _config.get('processing', {}).get('single_file_timeout', 180)
+TOTAL_COMMENT_TIMEOUT = _config.get('processing', {}).get('total_comment_timeout', 300)
+LOGGING_LEVEL = _config.get('logging', {}).get('level', 'INFO')
+SILICONFLOW_API_KEY = ''
+SILICONFLOW_API_BASE = ''
+
+# 配置日志
+logging.basicConfig(level=getattr(logging, LOGGING_LEVEL.upper()))
+logger = logging.getLogger(__name__)
+
+# ==================== 数据模型定义 ====================
+
+class PRAnalysisResult(BaseModel):
+    """PR分析结果的结构化输出"""
+    has_text_changes: bool = Field(description="是否涉及英文文本改动", default=False)
+    text_change_type: Literal["无文本改动", "仅标点符号改动", "英文内容改动", "代码注释改动", "混合改动"] = Field(description="文本改动类型")
+    has_grammar_errors: bool = Field(description="是否存在语法语病错误", default=False)
+    grammar_errors: List[str] = Field(description="具体的语法语病错误列表", default=[])
+    detailed_analysis: str = Field(description="详细分析说明")
+    suggestions: List[str] = Field(description="改进建议列表", default=[])
+
+class FileTextAnalysis(BaseModel):
+    """单个文件的文本分析"""
+    file_path: str = Field(description="文件路径", default="")
+    has_text_changes: bool = Field(description="是否涉及英文文本改动", default=False)
+    text_lines: List[str] = Field(description="涉及文本改动的行", default=[])
+    grammar_issues: List[str] = Field(description="语法问题列表", default=[])
+    analysis_details: str = Field(description="分析详情")
+
+@dataclass
+class DiffFileInfo:
+    """单个文件的diff信息"""
+    file_path: str
+    diff_content: str
+    lines_added: int
+    lines_deleted: int
+
+@dataclass
+class CommentResult:
+    """评论生成结果"""
+    pr_analysis: Optional[PRAnalysisResult]
+    file_analyses: List[FileTextAnalysis]
+    processed_files: int
+    total_files: int
+    error: Optional[str] = None
+
+# ==================== Token 统计工具 ====================
+
+
+# ==================== 工具函数 ====================
+
+class DiffParser:
+    """Git Diff 解析器"""
+
+    @staticmethod
+    def parse_git_diff(diff_content: str) -> List[DiffFileInfo]:
+        """
+        解析git diff内容,提取每个文件的改动信息
+
+        Args:
+            diff_content: git diff的原始内容
+
+        Returns:
+            包含文件路径和对应diff内容的列表
+        """
+
+        files = []
+        current_file = None
+        current_diff = []
+
+        lines = diff_content.strip().split('\n')
+
+        for line in lines:
+            # 匹配文件路径行
+            if line.startswith('diff --git'):
+                # 保存前一个文件的信息
+                if current_file and current_diff:
+                    diff_info = DiffParser._create_diff_file_info(current_file, current_diff)
+                    if diff_info:
+                        files.append(diff_info)
+
+                # 提取文件路径 - 改进的解析逻辑
+                current_file = DiffParser._extract_file_path(line)
+                if current_file:
+                    current_diff = [line]
+                else:
+                    current_diff = []
+            elif current_file:
+                current_diff.append(line)
+
+        # 添加最后一个文件
+        if current_file and current_diff:
+            diff_info = DiffParser._create_diff_file_info(current_file, current_diff)
+            if diff_info:
+                files.append(diff_info)
+
+        return files
+
+    @staticmethod
+    def _extract_file_path(diff_line: str) -> Optional[str]:
+        """
+        从git diff行中提取文件路径,支持包含汉字的文件名
+
+        Args:
+            diff_line: git diff的文件头行,格式如 "diff --git a/path/to/file b/path/to/file"
+
+        Returns:
+            提取出的文件路径,如果解析失败则返回None
+        """
+        try:
+            # 方法1: 处理引号包围的路径(Git对特殊字符的处理)
+            # 格式: diff --git "a/path/to/file" "b/path/to/file"
+            quoted_pattern = r'diff --git "a/(.+?)" "b/(.+?)"'
+            quoted_match = re.match(quoted_pattern, diff_line)
+
+            if quoted_match:
+                file_path_a = quoted_match.group(1)
+                file_path_b = quoted_match.group(2)
+                # 通常a和b路径相同,使用a路径(旧文件路径)
+                file_path = file_path_a
+            else:
+                # 方法2: 使用正则表达式匹配标准的git diff格式
+                # 格式: diff --git a/path/to/file b/path/to/file
+                pattern = r'diff --git a/(.+?) b/(.+?)(?:\s|$)'
+                match = re.match(pattern, diff_line)
+
+                if match:
+                    file_path_a = match.group(1)
+                    file_path_b = match.group(2)
+                    # 通常a和b路径相同,使用a路径(旧文件路径)
+                    file_path = file_path_a
+                else:
+                    # 方法3: 如果正则匹配失败,尝试更简单的解析
+                    # 处理可能包含空格和特殊字符的文件名
+                    if ' a/' in diff_line and ' b/' in diff_line:
+                        # 找到 a/ 和 b/ 的位置
+                        a_pos = diff_line.find(' a/')
+                        b_pos = diff_line.find(' b/')
+
+                        if a_pos != -1 and b_pos != -1 and a_pos < b_pos:
+                            # 提取a/和b/之间的路径
+                            a_start = a_pos + 3  # 跳过 ' a/'
+                            file_path = diff_line[a_start:b_pos]
+                        else:
+                            return None
+                    else:
+                        # 方法4: 最后的备选方案,简单的字符串分割
+                        parts = diff_line.split()
+                        if len(parts) >= 3:
+                            a_path = parts[2]
+                            if a_path.startswith('a/'):
+                                file_path = a_path[2:]  # 移除'a/'前缀
+                            else:
+                                return None
+                        else:
+                            return None
+
+            # 处理文件名编码
+            return DiffParser._decode_file_path(file_path)
+
+        except Exception as e:
+            logger.warning(f"解析文件路径时发生错误: {e}, diff行: {diff_line}")
+            return None
+
+    @staticmethod
+    def _decode_file_path(file_path: str) -> str:
+        """
+        解码文件路径,处理各种编码情况
+
+        Args:
+            file_path: 原始文件路径
+
+        Returns:
+            解码后的文件路径
+        """
+        try:
+            # 首先尝试URL解码,处理Git编码的文件名
+            decoded_path = urllib.parse.unquote(file_path, encoding='utf-8')
+
+            # 处理Git对特殊字符的引号包装
+            if decoded_path.startswith('"') and decoded_path.endswith('"'):
+                decoded_path = decoded_path[1:-1]
+                # Git使用反斜杠转义,需要处理转义序列
+                decoded_path = decoded_path.replace('\\"', '"')
+                decoded_path = decoded_path.replace('\\\\', '\\')
+
+            # 无论是否有引号包装,都尝试处理八进制编码
+            # 检查是否包含八进制转义序列
+            if '\\' in decoded_path and re.search(r'\\[0-7]{3}', decoded_path):
+                decoded_path = DiffParser._decode_octal_sequences(decoded_path)
+
+            return decoded_path
+
+        except Exception as e:
+            logger.warning(f"解码文件路径时发生错误: {e}, 原始路径: {file_path}")
+            return file_path
+
+    @staticmethod
+    def _decode_octal_sequences(text: str) -> str:
+        """
+        解码文本中的八进制转义序列
+
+        Args:
+            text: 包含八进制转义序列的文本
+
+        Returns:
+            解码后的文本
+        """
+        try:
+            # 查找八进制转义序列模式:\xxx
+            pattern = r'\\([0-7]{3})'
+
+            # 找到所有八进制序列
+            matches = list(re.finditer(pattern, text))
+            if not matches:
+                return text
+
+            # 收集所有字节值
+            result = ""
+            last_end = 0
+            bytes_buffer = []
+
+            for i, match in enumerate(matches):
+                # 添加匹配前的文本
+                if match.start() > last_end:
+                    # 如果有缓冲的字节,先处理它们
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                            result += decoded_bytes
+                            bytes_buffer = []
+                        except UnicodeDecodeError:
+                            # 如果解码失败,保持原始形式
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                            bytes_buffer = []
+
+                    result += text[last_end:match.start()]
+
+                # 处理当前八进制序列
+                octal_str = match.group(1)
+                try:
+                    byte_value = int(octal_str, 8)
+                    bytes_buffer.append(byte_value)
+                except ValueError:
+                    # 如果转换失败,添加原始字符串
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                            result += decoded_bytes
+                            bytes_buffer = []
+                        except UnicodeDecodeError:
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                            bytes_buffer = []
+                    result += match.group(0)
+
+                last_end = match.end()
+
+                # 检查是否是最后一个匹配或下一个匹配不连续
+                is_last = (i == len(matches) - 1)
+                is_next_non_consecutive = (not is_last and
+                                           matches[i + 1].start() != match.end())
+
+                if is_last or is_next_non_consecutive:
+                    # 处理缓冲的字节
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                        except UnicodeDecodeError:
+                            # 如果解码失败,保持原始形式
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                        bytes_buffer = []
+
+            # 添加剩余的文本
+            if last_end < len(text):
+                result += text[last_end:]
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"解码八进制序列时发生错误: {e}, 原始文本: {text}")
+            return text
+
+    @staticmethod
+    def _create_diff_file_info(file_path: str, diff_lines: List[str]) -> Optional[DiffFileInfo]:
+        """创建DiffFileInfo对象"""
+        diff_content = '\n'.join(diff_lines)
+        lines_added, lines_deleted = DiffParser._count_lines_changed(diff_content)
+
+        return DiffFileInfo(
+            file_path=file_path,
+            diff_content=diff_content,
+            lines_added=lines_added,
+            lines_deleted=lines_deleted
+        )
+
+    @staticmethod
+    def _count_lines_changed(diff_content: str) -> Tuple[int, int]:
+        """统计git diff中改动的行数"""
+        lines_added, lines_deleted = 0, 0
+        lines = diff_content.strip().split('\n')
+
+        for line in lines:
+            # 统计新增行(以+开头,但不是+++)
+            if line.startswith('+') and not line.startswith('+++'):
+                lines_added += 1
+            # 统计删除行(以-开头,但不是---)
+            elif line.startswith('-') and not line.startswith('---'):
+                lines_deleted += 1
+
+        return lines_added, lines_deleted
+
+# ==================== LangChain 组件 ====================
+
+class LLMFactory:
+    """LLM工厂类"""
+
+    @staticmethod
+    def create_chat_llm(model_name: str = None, base_url: str = None):
+        """创建LLM实例"""
+        if model_name is None:
+            model_name = MODEL_NAME
+
+        if BACKEND_TYPE == "siliconflow":
+            return ChatOpenAI(
+                model=model_name,
+                api_key=SecretStr(SILICONFLOW_API_KEY),
+                base_url=SILICONFLOW_API_BASE,
+                temperature=MODEL_TEMPERATURE
+            )
+        else:
+            raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}")
+
+class PromptTemplates:
+    """提示模板集合"""
+
+    @staticmethod
+    def get_file_text_analysis_prompt() -> ChatPromptTemplate:
+        """获取单文件文本分析提示模板"""
+        return ChatPromptTemplate.from_messages([
+            ("system", f"""
+你是一个专业的代码审查和语言专家,专注于分析Gitee文档仓库的翻译PR中的英文文本内容。每条PR都是人工生成的文档改动。请忽略中文、格式和代码的审计,专注于识别英文文本变更。
+
+注意:请忽略中文、格式和代码的审计,专注于识别英文文本变更。如果文档的变更不涉及英文文本,你只需要输出“不涉及英文改动”即可,不需要额外输出任何分析结果。
+同时:对于专有名词,例如openEuler、GitHub等,你不能将其纳入英文文本变更的纠错范围内,而是应该自动识别专有名词。对于代码的相关变更,也不应该纳入分析内容范围。
+
+你需要遵循**能不提修改意见就不提修改意见**的原则进行审查!!!
+
+请仔细分析这个文件的改动,并按照以下要求进行分析:
+
+**分析重点:**
+
+1. 英文文本变更识别:
+   - 检查是否涉及英文文本内容的改动
+   - 区分代码逻辑变更和英文文本内容变更
+   - 识别注释、文档字符串、用户显示文本等英文文本内容
+   - 标识出具体的英文文本变更行
+
+2. 语法错误检测:
+   - 检查英文文本的语法、拼写错误
+
+**分析类型判断:**
+- 如果改动不涉及任何英文文本内容,标记为"无英文文本改动"
+- 如果涉及代码注释的英文文本变更,标记为"代码注释改动"
+- 如果涉及文档、界面文本等英文内容变更,标记为"英文内容改动"
+
+**语法检查重点:**
+- 英文:主谓一致、时态、拼写、标点、语序
+
+**输出要求:**
+- 如果存在英文文本变更但变更不存在语法问题,则直接输出“不存在语法问题”,不需要任何额外输出
+- 详细列出发现的语法错误(如果有)
+- 不能超过100个汉字字符
+
+            """),
+            ("human", """
+文件路径: {file_path}
+
+Git Diff 内容:
+{diff_content}
+
+            """)
+        ])
+
+    @staticmethod
+    def get_pr_analysis_prompt() -> ChatPromptTemplate:
+        """获取整体PR分析提示模板"""
+        return ChatPromptTemplate.from_messages([
+            ("system", """
+你是一个专业的PR审查专家,专门分析Gitee文档仓库的翻译PR中的英文文本变更和语法问题。每条PR都是人工生成的文档改动。
+
+请分析所有文件的改动,并生成一个综合评估,要求:
+
+1. 整体文本变更评估:
+   - 统计涉及文本变更的文件数量
+   - 分析文本变更的类型分布
+   - 评估变更的重要性和影响范围
+   - 如果文本变更不涉及英文,或涉及英文但使用正确不需要改动,则**直接忽略**,无需对其进行总结
+
+2. 语法错误汇总:
+   - **仅汇总改动中的硬伤,如单词拼写错误、英语语法(时态语态)错误等**
+   - **对于一些可以优化但称不上错误的点,以最小化改动为原则,选择忽略**
+   - 提高报错阈值,忽略可优化翻译的点
+   - 提供优先修复建议
+
+3. 质量评估:
+   - 对整个PR的文本质量给出评分
+   - 分析文本变更的一致性
+   - 评估对用户体验的影响
+
+4. 改进建议:
+   - 提供具体的修改建议
+   - 推荐最佳实践
+   - 建议后续的质量控制措施
+
+**输出格式要求:**
+- 提供清晰的分析结论
+- 按优先级排列发现的问题
+- 给出可操作的改进建议
+
+            """),
+            ("human", """
+各个文件的分析结果:
+{file_analyses}
+
+总文件数: {total_files}
+涉及文本变更的文件数: {text_changed_files}
+            """)
+        ])
+
+class FileTextAnalysisChain:
+    """单文件文本分析任务链"""
+
+    def __init__(self, llm: ChatOpenAI):
+        self.llm = llm
+
+        # 创建输出解析器
+        self.output_parser = JsonOutputParser(pydantic_object=FileTextAnalysis)
+
+        # 为硅基流动平台添加输出格式说明
+        format_instructions = """
+请以JSON格式输出,包含以下字段:
+{{
+    "has_text_changes": "是否涉及英文文本改动(布尔值)",
+    "text_lines": "涉及文本改动的行(字符串列表)",
+    "grammar_issues": "语法问题列表(字符串列表)",
+    "analysis_details": "分析详情(字符串)"
+}}
+"""
+        # 创建新的prompt模板
+        system_template = """
+你是一个专业的代码审查和语言专家,专注于分析Gitee文档仓库的翻译PR中的英文文本内容。每条PR都是人工生成的文档改动。
+
+**核心原则:只关注必然存在明显错误的地方,其他文件都不需要关注!**
+
+**严格过滤条件:**
+1. 如果文档的变更不涉及英文文本,直接标记为"无英文文本改动",无需任何分析
+2. 如果涉及英文文本但语法完全正确,直接标记为"语法正确,无需关注"
+3. 如果仅涉及标点符号的微小调整,直接标记为"仅标点符号改动,无需关注"
+4. 对于专有名词(如openEuler、GitHub等),自动识别并忽略,不纳入纠错范围
+5. 对于代码相关变更,不纳入分析内容范围
+
+**只关注以下明显错误:**
+- 明显的单词拼写错误(如:recieve -> receive)
+- 严重的语法错误(如:主谓不一致、时态错误)
+- 明显的标点符号错误(如:缺少句号、逗号使用错误)
+- 明显的语序错误
+
+**忽略以下情况:**
+- 语法正确但可以优化的表达
+- 风格偏好问题
+- 轻微的标点符号调整
+- 术语选择的差异
+- 表达方式的个人偏好
+
+**输出要求:**
+- 如果不存在明显错误,直接输出"语法正确,无需关注"
+- 只有发现明显错误时才详细列出
+- 不能超过100个汉字字符
+- 遵循"能不提修改意见就不提修改意见"的原则
+
+{format_instructions}
+"""
+        human_template = """
+文件路径: {file_path}
+
+Git Diff 内容:
+{diff_content}
+"""
+        self.prompt = ChatPromptTemplate.from_messages([
+            ("system", system_template.format(format_instructions=format_instructions)),
+            ("human", human_template)
+        ])
+        self.chain = self.prompt | self.llm | self.output_parser
+
+    def analyze(self, diff_file_info: DiffFileInfo) -> Optional[FileTextAnalysis]:
+        """分析单个文件的文本变更"""
+        max_retry = MODEL_MAX_RETRY
+        for attempt in range(1, max_retry + 1):
+            # 如果不是第一次尝试,等待一段时间再重试,避免连续失败
+            if attempt > 1:
+                delay = min(attempt * 2, 10)  # 递增延迟,最多10秒
+                logger.info(f"第{attempt}次尝试分析文件 {diff_file_info.file_path},等待{delay}秒...")
+                time.sleep(delay)
+
+            try:
+                # 构造prompt字符串
+                prompt_args = {
+                    "file_path": diff_file_info.file_path,
+                    "diff_content": diff_file_info.diff_content
+                }
+
+                # 直接调用,简化超时控制
+                invoke_args = {
+                    "file_path": diff_file_info.file_path,
+                    "diff_content": diff_file_info.diff_content
+                }
+                result = self.chain.invoke(invoke_args)
+                # 验证结果有效性
+                if isinstance(result, (dict, FileTextAnalysis)):
+                    if isinstance(result, dict):
+                        result = FileTextAnalysis(**result)
+
+                    # 检查结果完整性
+                    if result and hasattr(result, 'analysis_details') and result.analysis_details:
+
+                        # 设置准确值
+                        result.file_path = diff_file_info.file_path
+
+                        # 检查是否只关注明显错误
+                        analysis_text = result.analysis_details.lower()
+                        if any(phrase in analysis_text for phrase in [
+                            "语法正确,无需关注",
+                            "无英文文本改动",
+                            "仅标点符号改动,无需关注",
+                            "不存在语法问题"
+                        ]):
+                            # 如果无问题,设置has_text_changes为False
+                            result.has_text_changes = False
+                            result.grammar_issues = []
+
+                        return result
+
+                # 结果无效,记录并重试
+                logger.warning(f"分析文件 {diff_file_info.file_path} 返回无效结果,第{attempt}次尝试")
+                if attempt < max_retry:
+                    continue
+            except Exception as e:
+                err_str = str(e)
+                # 检查是否为HTTP错误(如404、5xx),常见关键字有status code、HTTP、response等
+                is_http_error = False
+                for code in ["404", "500", "502", "503", "504"]:
+                    if code in err_str:
+                        is_http_error = True
+                        break
+                if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and any(code in err_str for code in ["404", "500", "502", "503", "504"]):
+                    is_http_error = True
+                if is_http_error:
+                    logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e},第{attempt}次尝试,10秒后重试...")
+                    if attempt < max_retry:
+                        time.sleep(10)
+                        continue
+                else:
+                    logger.error(f"分析文件 {diff_file_info.file_path} 时发生错误: {e},第{attempt}次尝试")
+                # 其它异常直接进入下一次重试
+                if attempt < max_retry:
+                    logger.info(f"第{attempt}次尝试失败,准备重试...")
+        logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry}次均未获得结构化输出,放弃。")
+        return None
+
+class PRAnalysisChain:
+    """PR整体分析任务链"""
+
+    def __init__(self, llm: ChatOllama | ChatOpenAI):
+        self.llm = llm
+
+        # 创建输出解析器
+        self.output_parser = JsonOutputParser(pydantic_object=PRAnalysisResult)
+
+        # 为硅基流动平台添加输出格式说明
+        format_instructions = """
+请以JSON格式输出,包含以下字段:
+{{
+    "has_text_changes": "是否涉及英文文本改动(布尔值)",
+    "text_change_type": "文本改动类型(字符串)",
+    "has_grammar_errors": "是否存在语法语病错误(布尔值)",
+    "grammar_errors": "具体的语法语病错误列表(字符串列表)",
+    "detailed_analysis": "详细分析说明(字符串)",
+    "suggestions": "改进建议列表(字符串列表)"
+}}
+"""
+        # 创建新的prompt模板
+        system_template = """
+你是一个专业的PR审查专家,专门分析Pull Request中的文本变更和语法问题。
+
+**核心原则:只关注必然存在明显错误的地方,其他文件都不需要关注!**
+
+请基于各个文件的分析结果,生成整个PR的综合评估,要求:
+
+1. 严格过滤文件:
+   - 只统计存在明显错误的文件
+   - 忽略"语法正确,无需关注"的文件
+   - 忽略"无英文文本改动"的文件
+   - 忽略"仅标点符号改动,无需关注"的文件
+
+2. 只汇总明显错误:
+   - 仅汇总硬伤:明显的单词拼写错误、严重的语法错误
+   - 忽略可优化但称不上错误的点
+   - 忽略风格偏好问题
+   - 忽略轻微的标点符号调整
+
+3. 质量评估:
+   - 只对存在明显错误的文件进行质量评估
+   - 如果所有文件都无问题,直接标记为"无问题文件"
+
+4. 改进建议:
+   - 只对存在明显错误的文件提供修改建议
+   - 建议优先修复明显的拼写和语法错误
+
+**输出格式要求:**
+- 如果所有文件都无问题,直接输出"所有文件语法正确,无需关注"
+- 只列出存在明显错误的文件
+- 按优先级排列发现的问题
+- 给出可操作的改进建议
+
+{format_instructions}
+"""
+        human_template = """
+各个文件的分析结果:
+{file_analyses}
+
+总文件数: {total_files}
+涉及文本变更的文件数: {text_changed_files}
+"""
+        self.prompt = ChatPromptTemplate.from_messages([
+            ("system", system_template.format(format_instructions=format_instructions)),
+            ("human", human_template)
+        ])
+        self.chain = self.prompt | self.llm | self.output_parser
+
+    def generate(self, file_analyses: List[FileTextAnalysis]) -> Optional[PRAnalysisResult]:
+        """生成PR整体分析"""
+        try:
+            total_files = len(file_analyses)
+
+            # 过滤出只关注存在明显错误的文件
+            problematic_files = []
+            for analysis in file_analyses:
+                # 检查是否存在明显错误
+                has_obvious_errors = (
+                    analysis.has_text_changes and
+                    analysis.grammar_issues and
+                    len(analysis.grammar_issues) > 0 and
+                    analysis.analysis_details and
+                    not any(phrase in analysis.analysis_details for phrase in [
+                        "语法正确,无需关注",
+                        "无英文文本改动",
+                        "仅标点符号改动,无需关注",
+                        "不存在语法问题"
+                    ])
+                )
+
+                if has_obvious_errors:
+                    problematic_files.append(analysis)
+
+            # 如果所有文件都无问题,直接返回无问题结果
+            if not problematic_files:
+                return PRAnalysisResult(
+                    has_text_changes=False,
+                    text_change_type="无文本改动",
+                    has_grammar_errors=False,
+                    grammar_errors=[],
+                    detailed_analysis="所有文件语法正确,无需关注",
+                    suggestions=[]
+                )
+
+            text_changed_files = len(problematic_files)
+
+            file_analyses_info = []
+            for analysis in problematic_files:
+                file_analyses_info.append({
+                    'file_path': analysis.file_path,
+                    'has_text_changes': analysis.has_text_changes,
+                    'text_lines': analysis.text_lines,
+                    'grammar_issues': analysis.grammar_issues,
+                    'analysis_details': analysis.analysis_details
+                })
+
+            # 构造prompt字符串
+            prompt_args = {
+                "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2),
+                "total_files": total_files,
+                "text_changed_files": text_changed_files
+            }
+
+            # 使用线程池执行器为PR分析添加超时控制
+            timeout_executor = None
+            try:
+                timeout_executor = ThreadPoolExecutor(max_workers=1)
+                invoke_args = {
+                    "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2),
+                    "total_files": total_files,
+                    "text_changed_files": text_changed_files
+                }
+                result = self.chain.invoke(invoke_args)
+                # 验证结果有效性
+                if isinstance(result, (dict, PRAnalysisResult)):
+                    # 如果是dict(来自JsonOutputParser),转换为PRAnalysisResult
+                    if isinstance(result, dict):
+                        result = PRAnalysisResult(**result)
+                    return result
+                else:
+                    logger.error(f"生成PR分析时返回类型错误: {type(result)}")
+                    return None
+            except Exception as e:
+                logger.error(f"生成PR分析时发生错误: {e}")
+                return None
+        except Exception as e:
+            logger.error(f"生成PR分析时发生错误: {e}")
+            return None
+
+# ==================== 主处理类 ====================
+
+class PRCommentAnalyzer:
+    """PR评论分析器"""
+
+    def __init__(self, siliconflow_api_key: str = "", siliconflow_api_base: str = "https://api.siliconflow.cn/v1", model_name: str = None, base_url: str = None):
+        if model_name is None:
+            model_name = MODEL_NAME
+
+        # 设置siliconflow API配置
+        global SILICONFLOW_API_KEY, SILICONFLOW_API_BASE
+        if siliconflow_api_key:
+            SILICONFLOW_API_KEY = siliconflow_api_key
+        if siliconflow_api_base:
+            SILICONFLOW_API_BASE = siliconflow_api_base
+
+        self.llm = LLMFactory.create_chat_llm(model_name)
+        self.file_analysis_chain = FileTextAnalysisChain(self.llm)
+        self.pr_analysis_chain = PRAnalysisChain(self.llm)
+
+    def cleanup(self):
+        """清理资源,确保程序能正确退出"""
+        try:
+            # 清理 LLM 连接
+            if hasattr(self.llm, 'client') and hasattr(self.llm.client, 'close'):
+                self.llm.client.close()
+            elif hasattr(self.llm, '_client') and hasattr(self.llm._client, 'close'):
+                self.llm._client.close()
+
+            # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端
+            if BACKEND_TYPE == "siliconflow" and hasattr(self.llm, 'client'):
+                try:
+                    # 强制关闭 httpx 客户端
+                    if hasattr(self.llm.client, '_client'):
+                        self.llm.client._client.close()
+                except Exception as e:
+                    logger.debug(f"关闭 HTTP 客户端时发生错误: {e}")
+
+            logger.info("资源清理完成")
+        except Exception as e:
+            logger.warning(f"清理资源时发生错误: {e}")
+
+    def analyze_pr_diff(self, diff_content: str, max_workers: int = None) -> CommentResult:
+        if max_workers is None:
+            max_workers = PROCESSING_MAX_WORKERS
+
+        logger.info("开始解析PR diff...")
+        files = DiffParser.parse_git_diff(diff_content)
+        logger.info(f"解析到 {len(files)} 个文件的改动")
+        if not files:
+            logger.warning("未找到任何文件改动")
+            return CommentResult(
+                pr_analysis=None,
+                file_analyses=[],
+                processed_files=0,
+                total_files=0,
+                error='未找到任何文件改动'
+            )
+
+        logger.info("开始并行处理各个文件的文本分析...")
+        file_analyses = []
+        # 使用更健壮的并发处理机制
+        executor = None
+        try:
+            executor = ThreadPoolExecutor(max_workers=max_workers)
+            future_to_file = {
+                executor.submit(self.file_analysis_chain.analyze, file_info): file_info.file_path
+                for file_info in files
+            }
+
+            # 设置更长的整体超时时间,避免与单个文件超时冲突
+            overall_timeout = SINGLE_FILE_TIMEOUT * len(files) + 600  # 给每个文件的时间 + 额外缓冲
+
+            completed_count = 0
+            total_count = len(future_to_file)
+
+            try:
+                for future in as_completed(future_to_file, timeout=overall_timeout):
+                    file_path = future_to_file[future]
+                    completed_count += 1
+                    try:
+                        analysis = future.result(timeout=5)  # 短暂缓冲时间,因为任务已经完成
+                        if analysis:
+                            file_analyses.append(analysis)
+                            logger.info(f"完成文件 {file_path} 的文本分析 ({completed_count}/{total_count})")
+                        else:
+                            logger.warning(f"文件 {file_path} 的文本分析失败 ({completed_count}/{total_count})")
+                    except (FutureTimeoutError, TimeoutError) as e:
+                        logger.error(f"文件 {file_path} 的文本分析获取超时,跳过该文件: {type(e).__name__} ({completed_count}/{total_count})")
+                        try:
+                            future.cancel()
+                        except Exception as cancel_e:
+                            logger.warning(f"取消任务时发生错误: {cancel_e}")
+                    except Exception as e:
+                        logger.error(f"处理文件 {file_path} 时发生异常: {e} ({completed_count}/{total_count})")
+            except (FutureTimeoutError, TimeoutError) as overall_e:
+                logger.error(f"整体处理超时({overall_timeout}秒),已完成{completed_count}/{total_count}个文件")
+                # 取消所有未完成的任务
+                for future in future_to_file:
+                    if not future.done():
+                        try:
+                            future.cancel()
+                        except Exception as cancel_e:
+                            logger.warning(f"取消未完成任务时发生错误: {cancel_e}")
+        finally:
+            # 确保线程池被正确关闭
+            if executor:
+                try:
+                    executor.shutdown(wait=True)
+                except Exception as shutdown_e:
+                    logger.warning(f"关闭主线程池时发生错误: {shutdown_e}")
+
+        logger.info(f"成功生成 {len(file_analyses)} 个文件的文本分析")
+        logger.info("开始生成PR整体分析...")
+        pr_analysis = None
+        if file_analyses:
+            logger.info(f"基于 {len(file_analyses)} 个成功处理的文件生成PR分析...")
+            try:
+                pr_analysis = self.pr_analysis_chain.generate(file_analyses)
+                if pr_analysis:
+                    logger.info("PR整体分析生成成功")
+                else:
+                    logger.warning("PR整体分析生成失败")
+            except Exception as e:
+                logger.error(f"生成PR分析时发生未预期的错误: {e}")
+        else:
+            logger.warning("没有成功处理的文件,跳过PR分析生成")
+
+        return CommentResult(
+            pr_analysis=pr_analysis,
+            file_analyses=file_analyses,
+            processed_files=len(file_analyses),
+            total_files=len(files)
+        )
+
+# ==================== 主函数 ====================
+
+def get_comment_analysis(sample_diff, siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1"):
+
+    analyzer = PRCommentAnalyzer(siliconflow_api_key, siliconflow_api_base)
+    result = None
+    try:
+        result = analyzer.analyze_pr_diff(sample_diff)
+    finally:
+        # 确保在函数退出前清理资源
+        analyzer.cleanup()
+
+    if not result:
+        print("处理失败,无法获取结果")
+        return None
+
+    if result.error:
+        print(f"错误: {result.error}")
+
+    print("\n=== 单文件文本分析 ===")
+    problematic_files = [f for f in result.file_analyses if f.has_text_changes and f.grammar_issues]
+    if problematic_files:
+        for analysis in problematic_files:
+            print(f"文件: {analysis.file_path}")
+            print(f"涉及文本变更: {analysis.has_text_changes}")
+            print(f"文本变更行: {analysis.text_lines}")
+            print(f"语法问题: {analysis.grammar_issues}")
+            print(f"分析详情: {analysis.analysis_details}")
+            print("-" * 50)
+    else:
+        print("所有文件语法正确,无需关注")
+
+    print("=== 处理结果 ===")
+    print(f"总文件数: {result.total_files}")
+    print(f"成功处理文件数: {result.processed_files}")
+
+    if result.pr_analysis:
+        print("\n=== PR整体分析 ===")
+        pr = result.pr_analysis
+        print(f"涉及文本变更: {pr.has_text_changes}")
+        print(f"文本变更类型: {pr.text_change_type}")
+        print(f"存在语法错误: {pr.has_grammar_errors}")
+        print(f"语法错误列表: {pr.grammar_errors}")
+        print(f"详细分析: {pr.detailed_analysis}")
+        print(f"改进建议: {pr.suggestions}")
+
+
+    return result
+
+if __name__ == "__main__":
+    # 微服务接口逻辑: 传递进来的就是 sample_diff 的内容
+    sample_diff = sys.argv[1]
+    result = get_comment_analysis(sample_diff)
+    print(result)
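A note on the trickiest part of comment_agent.py above: when a path contains non-ASCII characters (for example Chinese document names), git quotes the path and prints each raw byte as a \NNN octal escape, so consecutive escapes must be collected into one byte string before UTF-8 decoding; that is what DiffParser._decode_octal_sequences handles. A minimal standalone sketch of the same idea (illustrative only, not part of the patched module):

import re

def decode_octal_path(path: str) -> str:
    """Decode git-style \\NNN octal escapes (raw UTF-8 bytes) back into text."""
    def _sub(match: re.Match) -> str:
        octets = re.findall(r'\\([0-7]{3})', match.group(0))
        return bytes(int(o, 8) for o in octets).decode('utf-8', errors='replace')
    # Substitute each run of escapes at once so multi-byte characters survive.
    return re.sub(r'(?:\\[0-7]{3})+', _sub, path)

print(decode_octal_path(r'docs/\346\226\207\346\241\243.md'))  # -> docs/文档.md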
diff --git a/ci/tools/comment/create_comment.py b/ci/tools/comment/create_comment.py
new file mode 100644
index 00000000..481c7a2f
--- /dev/null
+++ b/ci/tools/comment/create_comment.py
@@ -0,0 +1,372 @@
+import argparse
+import json
+import logging
+import re
+import sys
+from dataclasses import dataclass, field
+from difflib import SequenceMatcher
+from typing import TypeVar, Generic
+from comment_agent import get_comment_analysis
+
+import requests
+import yaml
+
+logging.basicConfig(level=logging.INFO, stream=sys.stdout,
+                    format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s')
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Org:
+    org_name: str
+    comment_target_owner: str
+    comment_target_repo: str
+    auto_comment_enabled: bool = field(default=True)
+    confidence_threshold: float = field(default=0.7)
+    text_check_enabled: bool = field(default=True)
+    grammar_check_enabled: bool = field(default=True)
+
+
+@dataclass
+class CommentAgentConfig:
+    backend: dict = field(default_factory=dict)
+    model: dict = field(default_factory=dict)
+    processing: dict = field(default_factory=dict)
+    logging: dict = field(default_factory=dict)
+
+
+@dataclass
+class Config:
+    orgs: list[dict | Org]
+    comment_agent: dict | CommentAgentConfig = field(default_factory=dict)
+
+    def __post_init__(self):
+        tmp_orgs: list[Org] = []
+        for item in self.orgs:
+            tmp_orgs.append(Org(**item))
+        self.orgs = tmp_orgs
+
+        if isinstance(self.comment_agent, dict) and self.comment_agent:
+            self.comment_agent = CommentAgentConfig(**self.comment_agent)
+
+
+@dataclass
+class ReqArgs:
+    method: str
+    url: str
+    headers: dict[str, str]
+    params: dict[str, str] | None = field(default=None)
+    data: str | None = field(default=None)
+    timeout: int = field(default=180)
+
+
+T = TypeVar('T')
+content_type_is_text = "text/plain"
+content_type_is_json_dict = {}
+content_type_is_json_list = []
+
+
+def send_request(args: ReqArgs, t: Generic[T]) -> T:
+    error_count = 0
+    while error_count < 3:
+        try:
+            resp = requests.request(**args.__dict__)
+            resp.raise_for_status()
+            if type(t) is dict or type(t) is list:
+                res_data: dict | list = resp.json()
+            else:
+                res_data: str = resp.text
+        except requests.exceptions.RequestException as e:
+            if e.response.status_code in [400, 401, 403, 404, 405]:
+                logger.error("[ERROR] client error {}".format(e))
+                break
+            logger.error("[ERROR] server error: {}".format(e))
+            error_count += 1
+        else:
+            logger.info("[OK] [{}], {}".format(args.method, args.url))
+            return res_data
+    return None
+
+
+class GiteeClient:
+    """
+    Gitee OpenAPI 客户端
+    """
+    headers = {
+        "Content-Type": "application/json",
+        "Accept": "application/json",
+    }
+
+    def __init__(self, developer_token: str):
+        """
+        构造函数
+        :param developer_token: Gitee v5 token
+        """
+        self.headers["Authorization"] = "Bearer {}".format(developer_token)
+
+    def get_diff_content(self, owner: str, repo: str, number: int) -> str | None:
+        req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number)
+        req_args = ReqArgs(method="GET", url=req_url, headers=self.headers)
+        result: str | None = send_request(req_args, "")
+        if result is None:
+            logger.error("can not get diff file from PR: {}".format(req_url))
+        return result
+
+    def add_pr_comment(self, owner, repo, number, body):
+        req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number)
+        req_body = {
+            "body": "### 🤖 AI审查反馈 \n {} ".format(body)
+        }
+        req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body))
+        result: dict | None = send_request(req_args, {})
+        return result is not None
+
+
+def get_diff_file_list(diff_content: str) -> list[str]:
+    diff_files_list = []
+    diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]]
+    for diff_file in diff_files:
+        if diff_file.endswith('\"'):
+            d = re.compile(r'/[\d\s\S]+')
+            diff_file = d.findall(diff_file)
+            diff_file = diff_file[0].replace('/', '', 1).replace('\"', '')
+            diff_files_list.append(diff_file)
+        else:
+            diff_files_list.append(diff_file)
+    return diff_files_list
+
+
+def generate_comment_content(comment_result, pr_url: str, analysis_status: str = "success") -> str:
+    """根据分析结果生成评论内容"""
+    comment_body = ""
+
+    # 根据分析状态添加不同的状态标识
+    if analysis_status == "error":
+        comment_body += "### 分析状态:处理失败\n"
+        comment_body += "**分析过程中发生错误,无法生成详细反馈。请手动审查文本变更。**\n\n"
+    elif analysis_status == "low_confidence":
+        comment_body += "### 分析状态:置信度较低\n"
+        comment_body += "**当前分析置信度较低,结果仅供参考。建议进行人工审查。**\n\n"
+    elif analysis_status == "no_text_changes":
+        comment_body += "### 分析状态:无文本问题\n"
+        comment_body += "**AI分析结果显示本次PR未发现明显的文本变更或语法问题。无需改动。**\n\n"
+    elif analysis_status == "no_grammar_errors":
+        comment_body += "### 分析状态:文本质量良好\n"
+        comment_body += "**检测到文本变更,但未发现明显的语法错误,文本质量良好。无需改动。**\n\n"
+    else:  # success with issues
+        comment_body += "### 分析状态:发现需要关注的问题\n"
+        comment_body += "**AI分析发现了一些文本变更或语法问题,请查看下方详细信息。**\n\n"
+
+    # 如果有分析结果,添加详细信息
+    if comment_result and not comment_result.error:
+        # 如果有PR整体分析
+        if comment_result.pr_analysis:
+            pr_analysis = comment_result.pr_analysis
+
+            # 添加整体评估摘要
+            comment_body += "## 整体评估\n"
+            comment_body += f"- 涉及文本变更: {'是' if pr_analysis.has_text_changes else '否'}\n"
+            comment_body += f"- 文本变更类型: {pr_analysis.text_change_type}\n"
+            comment_body += f"- 存在语法错误: {'是' if pr_analysis.has_grammar_errors else '否'}\n\n"
+
+            # 添加详细分析
+            if pr_analysis.detailed_analysis:
+                comment_body += "## 详细分析\n"
+                comment_body += f"{pr_analysis.detailed_analysis}\n\n"
+
+            # 添加语法错误列表
+            if pr_analysis.grammar_errors:
+                comment_body += "## 语法问题\n"
+                for i, error in enumerate(pr_analysis.grammar_errors, 1):
+                    comment_body += f"{i}. {error}\n"
+                comment_body += "\n"
+
+            # 添加改进建议
+            if pr_analysis.suggestions:
+                comment_body += "## 改进建议\n"
+                for i, suggestion in enumerate(pr_analysis.suggestions, 1):
+                    comment_body += f"{i}. {suggestion}\n"
+                comment_body += "\n"
+
+        # 添加文件级别的分析结果
+        if comment_result.file_analyses:
+            # comment_body += "## 文件分析\n"
+
+            # 统计有问题的文件
+            files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues]
+            files_without_issues = [f for f in comment_result.file_analyses if not f.has_text_changes and not f.grammar_issues]
+
+            if files_with_issues:
+                comment_body += f"### 需要关注的文件 ({len(files_with_issues)} 个)\n"
+                for i, file_analysis in enumerate(files_with_issues, 1):
+                    comment_body += f"\n**{i}. {file_analysis.file_path}**\n"
+
+                    if file_analysis.has_text_changes:
+                        comment_body += f"- 文本变更: 检测到英文文本改动\n"
+                        if file_analysis.text_lines:
+                            comment_body += f"- 涉及行数: {len(file_analysis.text_lines)} 行\n"
+
+                    if file_analysis.grammar_issues:
+                        comment_body += f"- 语法问题: 发现 {len(file_analysis.grammar_issues)} 个问题\n"
+                        for j, issue in enumerate(file_analysis.grammar_issues, 1):
+                            comment_body += f" {j}. {issue}\n"
+
+                    if file_analysis.analysis_details:
+                        comment_body += f"- 分析详情: {file_analysis.analysis_details}\n"
+
+            if files_without_issues:
+                comment_body += f"\n### 无问题的文件 ({len(files_without_issues)} 个)\n"
+                for file_analysis in files_without_issues:
+                    comment_body += f"- {file_analysis.file_path}\n"
+
+    # 添加处理统计
+    # comment_body += f"\n### 处理统计\n"
+    # comment_body += f"- 总文件数: {comment_result.total_files}\n"
+    # comment_body += f"- 成功分析: {comment_result.processed_files}\n"
+    # comment_body += f"- 有文本变更: {len([f for f in comment_result.file_analyses if f.has_text_changes])}\n"
+    # comment_body += f"- 有语法问题: {len([f for f in comment_result.file_analyses if f.grammar_issues])}\n"
+
+    # 添加免责声明
+    comment_body += "## 免责声明\n"
+    comment_body += "本评论内容基于AI Agent技术自动生成,仅供参考。请开发者根据实际情况进行判断和修改。\n"
+
+    return comment_body
+
+
+class Args:
+    gitee_token: str
+    pr_owner: str
+    pr_repo: str
+    pr_number: int
+    siliconflow_api_key: str = ""
+    siliconflow_api_base: str = "https://api.siliconflow.cn/v1"
+
+    def validate(self):
+        valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number
+        if not valid:
+            logger.error("Invalid Command Arguments")
+            sys.exit(1)
+
+
+def load_config_yaml(yaml_path):
+    with open(yaml_path, "r", encoding="utf-8") as config_in:
+        data = yaml.safe_load(config_in)
+
+    if data is None:
+        return None
+    return Config(**data)
+
+
+def create_comment_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str,
+                                               pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str):
+    pr_html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number)
+
+    for org_item in conf.orgs:
+        if org_item.org_name != pr_owner:
+            continue
+
+        if not org_item.auto_comment_enabled:
+            logger.info(f"组织 {org_item.org_name} 未启用自动评论功能")
+            continue
+
+        # 移除文件筛选逻辑,对所有PR平等处理
+        logger.info("开始对PR进行全面文本分析(不限制文件类型和路径)")
+
+        # 获取diff内容
+        diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number)
+        if diff_content is None:
+            logger.error("无法获取PR的diff内容")
+            sys.exit(1)
+
+        # 调用AI Agent进行分析
+        logger.info("开始进行AI代码审查分析...")
+        comment_result = get_comment_analysis(diff_content, siliconflow_api_key, siliconflow_api_base)
+
+        if not comment_result:
+            logger.error("AI分析失败,将发布错误状态评论")
+            # 创建一个错误结果对象,确保能发布评论
+            from comment_agent import CommentResult
+            comment_result = CommentResult(
+                pr_analysis=None,
+                file_analyses=[],
+                processed_files=0,
+                total_files=0,
+                error="AI分析过程失败"
+            )
+
+        # 确定分析状态和评论内容
+        analysis_status = "success"
+
+        if comment_result.error:
+            analysis_status = "error"
+            logger.info("AI分析过程出错,将发布错误状态评论")
+        elif comment_result.pr_analysis:
+            pr_analysis = comment_result.pr_analysis
+
+            # 检查是否有文本变更或语法错误
+            if pr_analysis.has_text_changes and pr_analysis.has_grammar_errors:
+                analysis_status = "success"  # 有问题,正常处理
+                logger.info("检测到文本变更和语法错误,将发布问题报告评论")
+            elif pr_analysis.has_text_changes and not pr_analysis.has_grammar_errors:
+                analysis_status = "no_grammar_errors"
+                logger.info("检测到文本变更但无语法错误,将发布文本质量良好的评论")
+            elif not pr_analysis.has_text_changes:
+                analysis_status = "no_text_changes"
+                logger.info("未检测到文本变更,将发布无文本问题的评论")
+            else:
+                analysis_status = "success"
+                logger.info("检测到需要关注的问题,将发布详细分析评论")
+        else:
+            # 如果没有整体分析,检查是否有文件级别的问题
+            files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues]
+            if files_with_issues:
+                analysis_status = "success"
+                logger.info(f"检测到 {len(files_with_issues)} 个文件有文本问题,将发布文件级别问题评论")
+            else:
+                analysis_status = "no_text_changes"
+                logger.info("未检测到文件级别问题,将发布无问题评论")
+
+        # 总是生成和发布评论
+        comment_content = generate_comment_content(
+            comment_result,
+            pr_html_url,
+            analysis_status
+        )
+
+        # 发布评论
+        success = cli.add_pr_comment(pr_owner, pr_repo, pr_number, comment_content)
+        if success:
+            logger.info(f"AI代码审查评论发布成功 - 状态: {analysis_status}")
+        else:
+            logger.error(f"AI代码审查评论发布失败 - 状态: {analysis_status}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Create AI-powered PR comment based on text analysis')
+    parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token')
+    parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner')
+    parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo')
+    parser.add_argument('--pr_number', type=str, required=True, help='the PR number')
+    parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow')
+    parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow')
+    args = Args()
+    parser.parse_args(args=sys.argv[1:], namespace=args)
+    args.validate()
+
+    exec_py = sys.argv[0]
+    config_yaml_path = exec_py[:-2] + 'yaml'
+    conf = load_config_yaml(config_yaml_path)
+
+    cli = GiteeClient(args.gitee_token)
+
+    pr_owner = args.pr_owner
+    pr_repo = args.pr_repo
+    pr_number = args.pr_number
+    siliconflow_api_key = args.siliconflow_api_key
+    siliconflow_api_base = args.siliconflow_api_base
+    create_comment_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base)
+
+
+if __name__ == '__main__':
+    main()
logger.info("未检测到文件级别问题,将发布无问题评论") + + # 总是生成和发布评论 + comment_content = generate_comment_content( + comment_result, + pr_html_url, + analysis_status + ) + + # 发布评论 + success = cli.add_pr_comment(pr_owner, pr_repo, pr_number, comment_content) + if success: + logger.info(f"AI代码审查评论发布成功 - 状态: {analysis_status}") + else: + logger.error(f"AI代码审查评论发布失败 - 状态: {analysis_status}") + + +def main(): + parser = argparse.ArgumentParser(description='Create AI-powered PR comment based on text analysis') + parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') + parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') + parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') + parser.add_argument('--pr_number', type=str, required=True, help='the PR number') + parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') + parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') + args = Args() + parser.parse_args(args=sys.argv[1:], namespace=args) + args.validate() + + exec_py = sys.argv[0] + config_yaml_path = exec_py[:-2] + 'yaml' + conf = load_config_yaml(config_yaml_path) + + cli = GiteeClient(args.gitee_token) + + pr_owner = args.pr_owner + pr_repo = args.pr_repo + pr_number = args.pr_number + siliconflow_api_key = args.siliconflow_api_key + siliconflow_api_base = args.siliconflow_api_base + create_comment_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) + + +if __name__ == '__main__': + main() diff --git a/ci/tools/comment/create_comment.yaml b/ci/tools/comment/create_comment.yaml new file mode 100644 index 00000000..32ac269a --- /dev/null +++ b/ci/tools/comment/create_comment.yaml @@ -0,0 +1,38 @@ +# Comment Agent Configuration +comment_agent: + # Backend Configuration + backend: + type: "siliconflow" # Options: "ollama" or "siliconflow" + # siliconflow配置现在通过命令行参数传入 + + # Model Configuration + model: + name: "Qwen/Qwen3-8B" + temperature: 0.1 + max_retry: 5 # For siliconflow backend + + # Processing Configuration + processing: + max_workers: 8 # Number of parallel workers for file processing + single_file_timeout: 180 # Timeout for single file analysis (seconds) + total_comment_timeout: 300 # Timeout for total PR analysis (seconds) + + # Logging Configuration + logging: + level: "INFO" + +# PR Comment Configuration +orgs: + - org_name: openeuler + comment_target_owner: openeuler + comment_target_repo: docs + auto_comment_enabled: true + text_check_enabled: true # 是否启用文本变更检测 + grammar_check_enabled: true # 是否启用语法错误检测 + + - org_name: src-openeuler + comment_target_owner: openeuler + comment_target_repo: globalization + auto_comment_enabled: true + text_check_enabled: true + grammar_check_enabled: true \ No newline at end of file -- Gitee From e11aacffb261b30d184868d2396c31fad07c2e63 Mon Sep 17 00:00:00 2001 From: petermouse666 <708975811@qq.com> Date: Fri, 24 Oct 2025 13:48:31 +0800 Subject: [PATCH 2/2] update ci-bot for auto generating comment --- ci/tools/comment/comment_agent.py | 1960 +++++++++++++------------- ci/tools/comment/create_comment.py | 154 +- ci/tools/comment/create_comment.yaml | 2 +- 3 files changed, 1060 insertions(+), 1056 deletions(-) diff --git a/ci/tools/comment/comment_agent.py b/ci/tools/comment/comment_agent.py index 25dbe385..797a2522 100644 --- a/ci/tools/comment/comment_agent.py +++ 

From e11aacffb261b30d184868d2396c31fad07c2e63 Mon Sep 17 00:00:00 2001
From: petermouse666 <708975811@qq.com>
Date: Fri, 24 Oct 2025 13:48:31 +0800
Subject: [PATCH 2/2] update ci-bot for auto generating comment

---
 ci/tools/comment/comment_agent.py    | 1960 +++++++++--------
 ci/tools/comment/create_comment.py   |  154 +-
 ci/tools/comment/create_comment.yaml |    2 +-
 3 files changed, 1060 insertions(+), 1056 deletions(-)

diff --git a/ci/tools/comment/comment_agent.py b/ci/tools/comment/comment_agent.py
index 25dbe385..797a2522 100644
--- a/ci/tools/comment/comment_agent.py
+++ b/ci/tools/comment/comment_agent.py
@@ -1,958 +1,1002 @@
+import json
+import re
+import logging
+import urllib.parse
+from typing import List, Dict, Any, Optional, Tuple, Literal
+from dataclasses import dataclass
+from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError
+from pathlib import Path
+import sys
+import time
+# LangChain imports
+from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
+from langchain_core.runnables import RunnableLambda, RunnablePassthrough
+from pydantic import BaseModel, Field, SecretStr
+from langchain_community.llms import Ollama
+from langchain_ollama import ChatOllama
+from langchain.chains import TransformChain, SequentialChain
+from langchain_core.output_parsers import JsonOutputParser
+from langchain_openai import ChatOpenAI
+import yaml
+
+# ==================== 配置加载 ====================
+
+def load_config(config_file="create_comment.yaml"):
+    """从YAML文件加载配置"""
+    try:
+        with open(config_file, 'r', encoding='utf-8') as f:
+            config = yaml.safe_load(f)
+        return config.get('comment_agent', {})
+    except FileNotFoundError:
+        print(f"配置文件 {config_file} 不存在")
+        raise
+    except yaml.YAMLError as e:
+        print(f"解析配置文件时发生错误: {e}")
+        raise
+
+# 加载配置
+_config = load_config()
+
+# ==================== 配置常量 ====================
+
+BACKEND_TYPE = _config.get('backend', {}).get('type', 'siliconflow')
+MODEL_NAME = _config.get('model', {}).get('name', 'Qwen/Qwen3-8B')
+MODEL_TEMPERATURE = _config.get('model', {}).get('temperature', 0.1)
+MODEL_MAX_RETRY = _config.get('model', {}).get('max_retry', 5)
+PROCESSING_MAX_WORKERS = _config.get('processing', {}).get('max_workers', 8)
+SINGLE_FILE_TIMEOUT = _config.get('processing', {}).get('single_file_timeout', 180)
+TOTAL_COMMENT_TIMEOUT = _config.get('processing', {}).get('total_comment_timeout', 300)
+LOGGING_LEVEL = _config.get('logging', {}).get('level', 'INFO')
+SILICONFLOW_API_KEY = ''
+SILICONFLOW_API_BASE = ''
+
+# 配置日志
+logging.basicConfig(level=getattr(logging, LOGGING_LEVEL.upper()))
+logger = logging.getLogger(__name__)
+
+# ==================== 数据模型定义 ====================
+
+class 
PRAnalysisResult(BaseModel): + """PR分析结果的结构化输出""" + has_text_changes: bool = Field(description="是否有文本变更", default=True) + text_change_type: str = Field(description="文本变更类型", default="") + has_grammar_errors: bool = Field(description="是否存在语法错误", default=False) + grammar_errors: List[str] = Field(description="语法错误列表", default=[]) + detailed_analysis: str = Field(description="详细分析说明") + suggestions: List[str] = Field(description="改进建议列表", default=[]) + +class FileTextAnalysis(BaseModel): + """单个文件的文本分析""" + file_path: str = Field(description="文件路径", default="") + grammar_issues: List[str] = Field(description="语法问题列表", default=[]) + analysis_details: str = Field(description="分析详情") + +@dataclass +class DiffFileInfo: + """单个文件的diff信息""" + file_path: str + diff_content: str + lines_added: int + lines_deleted: int + +@dataclass +class CommentResult: + """评论生成结果""" + pr_analysis: Optional[PRAnalysisResult] + file_analyses: List[FileTextAnalysis] + processed_files: int + total_files: int + error: Optional[str] = None + + +# ==================== 工具函数 ==================== + +class DiffParser: + """Git Diff 解析器""" + + @staticmethod + def filter_docs_en_files(diff_content: str) -> str: + """ + 过滤diff内容,只保留docs/en路径下的文件变更 + """ + if not diff_content: + return "" + + lines = diff_content.split('\n') + filtered_lines = [] + current_file_section = [] + in_docs_en_file = False + current_file_path = "" + + for line in lines: + if line.startswith('diff --git'): + # 处理前一个文件 + if in_docs_en_file and current_file_section: + filtered_lines.extend(current_file_section) + logger.info(f"包含docs/en路径下的文件: {current_file_path}") + + # 检查新文件是否在docs/en路径下 + current_file_section = [line] + in_docs_en_file = False + current_file_path = "" + + # 提取文件路径 + if ' a/' in line and ' b/' in line: + # 找到 a/ 和 b/ 的位置 + a_pos = line.find(' a/') + b_pos = line.find(' b/') + + if a_pos != -1 and b_pos != -1 and a_pos < b_pos: + # 提取a/和b/之间的路径 + a_start = a_pos + 3 # 跳过 ' a/' + current_file_path = line[a_start:b_pos] + + # 检查是否在docs/en路径下 + if current_file_path.startswith('docs/en/'): + in_docs_en_file = True + else: + # 继续当前文件的内容 + current_file_section.append(line) + + # 处理最后一个文件 + if in_docs_en_file and current_file_section: + filtered_lines.extend(current_file_section) + logger.info(f"包含docs/en路径下的文件: {current_file_path}") + + return '\n'.join(filtered_lines) + + @staticmethod + def parse_git_diff(diff_content: str) -> List[DiffFileInfo]: + """ + 解析git diff内容,提取每个文件的改动信息 + + Args: + diff_content: git diff的原始内容 + + Returns: + 包含文件路径和对应diff内容的列表 + """ + + files = [] + current_file = None + current_diff = [] + + lines = diff_content.strip().split('\n') + + for line in lines: + # 匹配文件路径行 + if line.startswith('diff --git'): + # 保存前一个文件的信息 + if current_file and current_diff: + diff_info = DiffParser._create_diff_file_info(current_file, current_diff) + if diff_info: + files.append(diff_info) + + # 提取文件路径 - 改进的解析逻辑 + current_file = DiffParser._extract_file_path(line) + if current_file: + current_diff = [line] + else: + current_diff = [] + elif current_file: + current_diff.append(line) + + # 添加最后一个文件 + if current_file and current_diff: + diff_info = DiffParser._create_diff_file_info(current_file, current_diff) + if diff_info: + files.append(diff_info) + + return files + + @staticmethod + def _extract_file_path(diff_line: str) -> Optional[str]: + """ + 从git diff行中提取文件路径,支持包含汉字的文件名 + + Args: + diff_line: git diff的文件头行,格式如 "diff --git a/path/to/file b/path/to/file" + + Returns: + 提取出的文件路径,如果解析失败则返回None + """ + try: + # 方法1: 处理引号包围的路径(Git对特殊字符的处理) 
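+            # 示例(假设的文件名,仅作说明):Git会把含非ASCII字符的路径
+            # 写成带引号、八进制转义的形式,例如:
+            #   diff --git "a/docs/zh/\346\226\207\346\241\243.md" "b/docs/zh/\346\226\207\346\241\243.md"
+            # 这类行先由下面的带引号正则匹配,其中的八进制序列再由
+            # _decode_file_path/_decode_octal_sequences还原为 docs/zh/文档.md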
+ # 格式: diff --git "a/path/to/file" "b/path/to/file" + quoted_pattern = r'diff --git "a/(.+?)" "b/(.+?)"' + quoted_match = re.match(quoted_pattern, diff_line) + + if quoted_match: + file_path_a = quoted_match.group(1) + file_path_b = quoted_match.group(2) + # 通常a和b路径相同,使用a路径(旧文件路径) + file_path = file_path_a + else: + # 方法2: 使用正则表达式匹配标准的git diff格式 + # 格式: diff --git a/path/to/file b/path/to/file + pattern = r'diff --git a/(.+?) b/(.+?)(?:\s|$)' + match = re.match(pattern, diff_line) + + if match: + file_path_a = match.group(1) + file_path_b = match.group(2) + # 通常a和b路径相同,使用a路径(旧文件路径) + file_path = file_path_a + else: + # 方法3: 如果正则匹配失败,尝试更简单的解析 + # 处理可能包含空格和特殊字符的文件名 + if ' a/' in diff_line and ' b/' in diff_line: + # 找到 a/ 和 b/ 的位置 + a_pos = diff_line.find(' a/') + b_pos = diff_line.find(' b/') + + if a_pos != -1 and b_pos != -1 and a_pos < b_pos: + # 提取a/和b/之间的路径 + a_start = a_pos + 3 # 跳过 ' a/' + file_path = diff_line[a_start:b_pos] + else: + return None + else: + # 方法4: 最后的备选方案,简单的字符串分割 + parts = diff_line.split() + if len(parts) >= 3: + a_path = parts[2] + if a_path.startswith('a/'): + file_path = a_path[2:] # 移除'a/'前缀 + else: + return None + else: + return None + + # 处理文件名编码 + return DiffParser._decode_file_path(file_path) + + except Exception as e: + logger.warning(f"解析文件路径时发生错误: {e}, diff行: {diff_line}") + return None + + @staticmethod + def _decode_file_path(file_path: str) -> str: + """ + 解码文件路径,处理各种编码情况 + + Args: + file_path: 原始文件路径 + + Returns: + 解码后的文件路径 + """ + try: + # 首先尝试URL解码,处理Git编码的文件名 + decoded_path = urllib.parse.unquote(file_path, encoding='utf-8') + + # 处理Git对特殊字符的引号包装 + if decoded_path.startswith('"') and decoded_path.endswith('"'): + decoded_path = decoded_path[1:-1] + # Git使用反斜杠转义,需要处理转义序列 + decoded_path = decoded_path.replace('\\"', '"') + decoded_path = decoded_path.replace('\\\\', '\\') + + # 无论是否有引号包装,都尝试处理八进制编码 + # 检查是否包含八进制转义序列 + if '\\' in decoded_path and re.search(r'\\[0-7]{3}', decoded_path): + decoded_path = DiffParser._decode_octal_sequences(decoded_path) + + return decoded_path + + except Exception as e: + logger.warning(f"解码文件路径时发生错误: {e}, 原始路径: {file_path}") + return file_path + + @staticmethod + def _decode_octal_sequences(text: str) -> str: + """ + 解码文本中的八进制转义序列 + + Args: + text: 包含八进制转义序列的文本 + + Returns: + 解码后的文本 + """ + try: + # 查找八进制转义序列模式:\xxx + pattern = r'\\([0-7]{3})' + + # 找到所有八进制序列 + matches = list(re.finditer(pattern, text)) + if not matches: + return text + + # 收集所有字节值 + result = "" + last_end = 0 + bytes_buffer = [] + + for i, match in enumerate(matches): + # 添加匹配前的文本 + if match.start() > last_end: + # 如果有缓冲的字节,先处理它们 + if bytes_buffer: + try: + decoded_bytes = bytes(bytes_buffer).decode('utf-8') + result += decoded_bytes + bytes_buffer = [] + except UnicodeDecodeError: + # 如果解码失败,保持原始形式 + for byte_val in bytes_buffer: + result += f"\\{oct(byte_val)[2:].zfill(3)}" + bytes_buffer = [] + + result += text[last_end:match.start()] + + # 处理当前八进制序列 + octal_str = match.group(1) + try: + byte_value = int(octal_str, 8) + bytes_buffer.append(byte_value) + except ValueError: + # 如果转换失败,添加原始字符串 + if bytes_buffer: + try: + decoded_bytes = bytes(bytes_buffer).decode('utf-8') + result += decoded_bytes + bytes_buffer = [] + except UnicodeDecodeError: + for byte_val in bytes_buffer: + result += f"\\{oct(byte_val)[2:].zfill(3)}" + bytes_buffer = [] + result += match.group(0) + + last_end = match.end() + + # 检查是否是最后一个匹配或下一个匹配不连续 + is_last = (i == len(matches) - 1) + is_next_non_consecutive = (not is_last and + matches[i + 1].start() != match.end()) + + if 
is_last or is_next_non_consecutive:
+                    # 处理缓冲的字节
+                    if bytes_buffer:
+                        try:
+                            decoded_bytes = bytes(bytes_buffer).decode('utf-8')
+                            # 修复:成功解码后必须写回result,否则缓冲的字节会被静默丢弃
+                            result += decoded_bytes
+                        except UnicodeDecodeError:
+                            # 如果解码失败,保持原始形式
+                            for byte_val in bytes_buffer:
+                                result += f"\\{oct(byte_val)[2:].zfill(3)}"
+                        bytes_buffer = []
+
+            # 添加剩余的文本
+            if last_end < len(text):
+                result += text[last_end:]
+
+            return result
+
+        except Exception as e:
+            logger.warning(f"解码八进制序列时发生错误: {e}, 原始文本: {text}")
+            return text
+
+    @staticmethod
+    def _create_diff_file_info(file_path: str, diff_lines: List[str]) -> Optional[DiffFileInfo]:
+        """创建DiffFileInfo对象"""
+        diff_content = '\n'.join(diff_lines)
+        lines_added, lines_deleted = DiffParser._count_lines_changed(diff_content)
+
+        return DiffFileInfo(
+            file_path=file_path,
+            diff_content=diff_content,
+            lines_added=lines_added,
+            lines_deleted=lines_deleted
+        )
+
+    @staticmethod
+    def _count_lines_changed(diff_content: str) -> Tuple[int, int]:
+        """统计git diff中改动的行数"""
+        lines_added, lines_deleted = 0, 0
+        lines = diff_content.strip().split('\n')
+
+        for line in lines:
+            # 统计新增行(以+开头,但不是+++)
+            if line.startswith('+') and not line.startswith('+++'):
+                lines_added += 1
+            # 统计删除行(以-开头,但不是---)
+            elif line.startswith('-') and not line.startswith('---'):
+                lines_deleted += 1
+
+        return lines_added, lines_deleted
+
+    @staticmethod
+    def is_punctuation_only_change(diff_content: str) -> bool:
+        """判断一个 diff 是否仅包含标点/空白改动(不包含英文字母数字层面的实质变化)
+
+        核心逻辑:
+        - 提取added_text和removed_text
+        - 只保留英文字母数字和下划线进行对比
+        - 如果这部分相同,说明英文内容没变,只是标点/空白/中文改了
+        - 如果这部分不同,说明有英文内容变更,不是"仅标点改动"
+
+        注意:此函数主要用于过滤纯标点/空白改动,避免对这类改动进行语法检查
+        """
+        try:
+            added_parts = []
+            removed_parts = []
+            for raw_line in diff_content.strip().split('\n'):
+                if raw_line.startswith('+++') or raw_line.startswith('---'):
+                    continue
+                if raw_line.startswith('+'):
+                    added_parts.append(raw_line[1:])
+                elif raw_line.startswith('-'):
+                    removed_parts.append(raw_line[1:])
+
+            added_text = '\n'.join(added_parts)
+            removed_text = '\n'.join(removed_parts)
+
+            # 如果没有改动,返回False
+            if added_text == removed_text:
+                return False
+
+            # 只保留英文字母数字和下划线
+            def keep_word_chars(s: str) -> str:
+                return re.sub(r'[^A-Za-z0-9_]', '', s)
+
+            added_word_chars = keep_word_chars(added_text)
+            removed_word_chars = keep_word_chars(removed_text)
+
+            # 核心判断:如果英文字母数字部分完全相同,才认为是"仅标点改动"
+            #
+            # 会被识别为"仅标点改动"(返回True,跳过语法检查):
+            # 1. 纯标点改动(如逗号改句号)
+            # 2. 空白改动(如空格、换行)
+            # 3. 中文标点改动
+            #
+            # 不会被识别为"仅标点改动"(返回False,进入语法检查):
+            # 1. 新增/删除英文字母
+            # 2. 英文单词拼写改动
+            # 3. 英文内容的任何实质性改动
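+            #
+            # 用法示意(假设的最小diff输入,仅作说明):
+            #   is_punctuation_only_change("-Hello, world\n+Hello. world")
+            #       -> True(字母数字序列相同,仅标点不同,跳过语法检查)
+            #   is_punctuation_only_change("-recieve data\n+receive data")
+            #       -> False(字母序列发生变化,进入语法检查)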
+            if added_word_chars == removed_word_chars and added_text != removed_text:
+                # 额外检查:如果两者都没有英文字母数字(纯中文/标点改动)
+                # 并且原始文本长度差异很大,可能不只是标点改动
+                if not added_word_chars and not removed_word_chars:
+                    # 如果都是纯中文/标点,检查长度差异
+                    # 长度差异超过10个字符,可能是中文内容的实质性改动
+                    if abs(len(added_text) - len(removed_text)) > 10:
+                        return False
+                return True
+
+            return False
+        except Exception as e:
+            logger.debug(f"判定仅标点改动时发生错误: {e}")
+            return False
+
+# ==================== LangChain 组件 ====================
+
+class LLMFactory:
+    """LLM工厂类"""
+
+    @staticmethod
+    def create_chat_llm(model_name: str = None, base_url: str = None):
+        """创建LLM实例"""
+        if model_name is None:
+            model_name = MODEL_NAME
+
+        if BACKEND_TYPE == "siliconflow":
+            return ChatOpenAI(
+                model=model_name,
+                api_key=SecretStr(SILICONFLOW_API_KEY),
+                base_url=SILICONFLOW_API_BASE,
+                temperature=0  # 使用0温度确保最大确定性和一致性
+            )
+        else:
+            raise ValueError(f"不支持的后端类型: {BACKEND_TYPE}")
+
+class FileTextAnalysisChain:
+    """单文件文本分析任务链"""
+
+    def __init__(self, llm: ChatOpenAI):
+        self.llm = llm
+
+        # 创建输出解析器
+        self.output_parser = JsonOutputParser(pydantic_object=FileTextAnalysis)
+
+        # 输出格式说明
+        format_instructions = """
+请以JSON格式输出,包含以下字段:
+{{
+    "grammar_issues": "语法问题列表(字符串列表,如无问题则为空列表)",
+    "analysis_details": "分析详情(字符串)"
+}}
+"""
+        # 创建新的prompt模板
+        system_template = """
+你是英文语法检查专家,专门审查文档中英文文本的明显拼写和语法错误。
+
+【核心原则:严格、一致、客观】
+必须对所有文件使用完全相同的判断标准!对同类错误必须给出一致的结论!
+
+【必须检查的错误类型】
+严格按照以下标准判断,不得有任何主观性:
+
+1. 明显的拼写错误:
+   - 常见单词拼写错误(如:recieve → receive, teh → the, seperate → separate)
+   - 随机字符串/无意义字符序列(如:awfawfwafaw, asvasvasv, xyzabc等)
+   - 判断标准:如果一个英文字符串不是:
+     * 技术词典中正确拼写的英文单词
+     * 技术术语(如:JSON, API, HTTP)
+     * 专有名词(如:GitHub, openEuler)
+     * 缩写词(如:PR, CI, CD)
+     * 文件名/路径/命令等
+     则认定为拼写错误
+
+2. 明显的时态错误:
+   - He go yesterday → He went yesterday
+   - She don't went → She didn't go
+   - 必须是显而易见的时态不匹配
+
+3. 严重的主谓不一致:
+   - They is → They are
+   - He are → He is
+   - It is sings → It sings/It is singing
+   - 必须是显而易见的主谓不匹配
+
+4. 其他明显的语法错误:
+   - 动词形式错误(如:He can goes → He can go)
+   - 名词单复数错误(如:many book → many books)
+   - 介词使用错误(如:depend in → depend on)
+   - 冠词使用错误(如:a apple → an apple)
+   - 语态使用错误(如:主动语态和被动语态混淆)
+
+【完全忽略以下内容,直接输出"无需关注"】
+- 任何中文文本(包括中文列表项、中文注释、中文文档)
+- 所有格式问题:链接格式、大小写格式、标点符号、空格、缩进、换行
+- 代码/命令/文件名/路径/配置/脚本/Shell命令(如:/etc/yum.conf, npm install)
+- Markdown语法:标题、列表、表格、链接、图片
+- 专有名词:GitHub、openEuler、Gitee、GVP、CVE、CWE等
+- 口语化表达或技术文档中的简化表达
+- 缺少冠词的表达(口语化和技术文档中常见且可接受)
+- 句子结构简化(技术文档中常见且可接受)
+
+【判定流程 - 严格执行】
+对于每个新增的英文文本:
+1. 是否在代码/配置/路径/命令中?→ 是 → "无需关注"
+2. 是否是专有名词/技术术语/缩写?→ 是 → "无需关注"
+3. 是否是标准英文单词?→ 否 → 检查是否为随机字符串
+4. 如果是随机无意义字符串(无法识别字符串的含义)→ 报告为拼写错误
+5. 如果是完整句子 → 检查时态、主谓一致性、动词形式、名词单复数、介词、冠词等
+6. 
其他情况 → "无需关注" + +【一致性要求】 +对于以下情况必须一致判断: +- awfawfwafaw, asvasvasv, xyzabc 等无意义的随机字符串 → 必须全部识别 +- recieve, teh, seperate 等常见拼写错误 → 必须全部识别 +- 同样的语法错误在不同文件中 → 必须得出相同结论 + +【输出要求】 +- 必须使用中文输出所有分析内容 +- analysis_details字段必须用中文解释问题 +- grammar_issues列表中的每一项都必须用中文描述 +- 对于随机字符串,明确指出"随机无意义字符串"或"拼写错误" +- 保持判断的一致性和可重复性 +- 确保准确完整地识别所有问题类型,不遗漏任何明显的错误 + +{format_instructions} +""" + human_template = """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def analyze(self, diff_file_info: DiffFileInfo) -> Optional[FileTextAnalysis]: + """分析单个文件的文本变更""" + logger.info(f"开始分析文件: {diff_file_info.file_path}") + max_retry = MODEL_MAX_RETRY + + for attempt in range(1, max_retry + 1): + # 重试时采用指数退避策略 + if attempt > 1: + delay = min(2 ** (attempt - 1), 10) # 2, 4, 8, 10, 10... + logger.info(f"第{attempt}次尝试,等待{delay}秒...") + time.sleep(delay) + + try: + # 调用LLM分析 + invoke_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content + } + result = self.chain.invoke(invoke_args) + + # 验证结果有效性 + if isinstance(result, dict): + result = FileTextAnalysis(**result) + + if isinstance(result, FileTextAnalysis) and result.analysis_details: + result.file_path = diff_file_info.file_path + # 确保grammar_issues为列表 + if not result.grammar_issues: + result.grammar_issues = [] + return result + + # 结果无效,重试 + logger.warning(f"分析返回无效结果,第{attempt}/{max_retry}次尝试") + + except Exception as e: + err_str = str(e) + # 判断是否为HTTP错误 + is_http_error = any(code in err_str for code in ["404", "500", "502", "503", "504"]) + + if is_http_error: + logger.error(f"HTTP错误: {e},第{attempt}/{max_retry}次尝试") + if attempt < max_retry: + time.sleep(10) # HTTP错误等待更长时间 + else: + logger.error(f"分析错误: {e},第{attempt}/{max_retry}次尝试") + + logger.error(f"分析文件 {diff_file_info.file_path} 失败,已重试{max_retry}次") + return None + +class PRAnalysisChain: + """PR整体分析任务链""" + + def __init__(self, llm: ChatOllama | ChatOpenAI): + self.llm = llm + + # 创建输出解析器 + self.output_parser = JsonOutputParser(pydantic_object=PRAnalysisResult) + + # 输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "has_text_changes": "是否有文本变更(布尔值)", + "text_change_type": "文本变更类型(字符串)", + "has_grammar_errors": "是否存在语法错误(布尔值)", + "grammar_errors": "语法错误列表(字符串列表)", + "detailed_analysis": "详细分析说明(字符串)", + "suggestions": "改进建议列表(字符串列表)" +}} +""" + # 创建prompt模板 + system_template = """ +你是PR审查专家,汇总各文件的英文拼写和语法检查结果。 + +【核心任务】 +汇总所有存在语法问题的文件,包括: +- 拼写错误(随机字符串、单词拼写错误) +- 时态错误 +- 主谓不一致 +- 其他明显的语法错误 + +【输出要求】 +- has_text_changes: 如果有任何文本变更则为true,否则为false +- text_change_type: 描述变更类型(如"文本变更且有语法错误"、"文本变更但无语法错误"等) +- has_grammar_errors: 如果存在语法错误则为true,否则为false +- grammar_errors: 所有语法错误的列表(用中文描述) +- detailed_analysis: 简洁明了的分析说明(不超过200字,使用中文) +- suggestions: 改进建议列表(如有问题则提供建议,无问题则为空列表) + +{format_instructions} +""" + human_template = """ +各个文件的分析结果: +{file_analyses} + +总文件数: {total_files} +涉及文本变更的文件数: {text_changed_files} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def generate(self, file_analyses: List[FileTextAnalysis], + total_comment_timeout: int = TOTAL_COMMENT_TIMEOUT) -> Optional[PRAnalysisResult]: + """生成PR整体分析""" + try: + total_files = len(file_analyses) + + # 
只保留有语法问题的文件 + problematic_files = [f for f in file_analyses if f.grammar_issues] + + # 如果所有文件都无问题,直接返回 + if not problematic_files: + return PRAnalysisResult( + has_text_changes=True, + text_change_type="文本变更但无语法错误", + has_grammar_errors=False, + grammar_errors=[], + detailed_analysis="所有文件无问题", + suggestions=[] + ) + + # 构造分析信息 + file_analyses_info = [ + { + 'file_path': f.file_path, + 'grammar_issues': f.grammar_issues, + 'analysis_details': f.analysis_details + } + for f in problematic_files + ] + + # 使用线程池添加超时控制 + with ThreadPoolExecutor(max_workers=1) as executor: + invoke_args = { + "file_analyses": json.dumps(file_analyses_info, ensure_ascii=False, indent=2), + "total_files": total_files, + "text_changed_files": len(problematic_files) + } + + future = executor.submit(self.chain.invoke, invoke_args) + try: + result = future.result(timeout=total_comment_timeout) + except (FutureTimeoutError, TimeoutError): + logger.error(f"生成PR分析超时({total_comment_timeout}秒)") + future.cancel() + return None + + # 处理结果 + if isinstance(result, dict): + result = PRAnalysisResult(**result) + return result if isinstance(result, PRAnalysisResult) else None + + except Exception as e: + logger.error(f"生成PR分析时发生错误: {e}") + return None + +# ==================== 主处理类 ==================== + +class PRCommentAnalyzer: + """PR评论分析器""" + + def __init__(self, siliconflow_api_key: str = "", + siliconflow_api_base: str = "https://api.siliconflow.cn/v1", + model_name: str = None, base_url: str = None): + if model_name is None: + model_name = MODEL_NAME + + # 设置siliconflow API配置 + global SILICONFLOW_API_KEY, SILICONFLOW_API_BASE + if siliconflow_api_key: + SILICONFLOW_API_KEY = siliconflow_api_key + if siliconflow_api_base: + SILICONFLOW_API_BASE = siliconflow_api_base + + self.llm = LLMFactory.create_chat_llm(model_name) + self.file_analysis_chain = FileTextAnalysisChain(self.llm) + self.pr_analysis_chain = PRAnalysisChain(self.llm) + + def cleanup(self): + """清理资源,确保程序能正确退出""" + try: + # 清理 LLM 连接 + if hasattr(self.llm, 'client') and hasattr(self.llm.client, 'close'): + self.llm.client.close() + elif hasattr(self.llm, '_client') and hasattr(self.llm._client, 'close'): + self.llm._client.close() + + # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端 + if BACKEND_TYPE == "siliconflow" and hasattr(self.llm, 'client'): + try: + # 强制关闭 httpx 客户端 + if hasattr(self.llm.client, '_client'): + self.llm.client._client.close() + except Exception as e: + logger.debug(f"关闭 HTTP 客户端时发生错误: {e}") + + logger.info("资源清理完成") + except Exception as e: + logger.warning(f"清理资源时发生错误: {e}") + + def analyze_pr_diff(self, diff_content: str, max_workers: int = None) -> CommentResult: + if max_workers is None: + max_workers = PROCESSING_MAX_WORKERS + + # 早期检查:查看diff中是否包含docs/en路径下的文件变更 + if 'docs/en/' not in diff_content: + logger.info("diff内容中不包含docs/en路径下的文件变更,无需进行语法检查") + return CommentResult( + pr_analysis=PRAnalysisResult( + has_text_changes=False, + text_change_type="无docs/en路径下的文件变更", + has_grammar_errors=False, + grammar_errors=[], + detailed_analysis="本次改动不涉及docs/en路径下的文件,无需语法检查", + suggestions=[] + ), + file_analyses=[], + processed_files=0, + total_files=0 + ) + + # 过滤只保留docs/en路径下的文件 + logger.info("过滤diff内容,只保留docs/en路径下的文件...") + filtered_diff_content = DiffParser.filter_docs_en_files(diff_content) + + # 检查是否有需要处理的docs/en路径下的文件变更 + if not filtered_diff_content.strip(): + logger.info("没有需要处理的docs/en路径下的文件变更,无需进行语法检查") + return CommentResult( + pr_analysis=PRAnalysisResult( + has_text_changes=False, + text_change_type="无文本改动", + has_grammar_errors=False, + 
grammar_errors=[], + detailed_analysis="过滤后没有docs/en路径下的文件需要检查", + suggestions=[] + ), + file_analyses=[], + processed_files=0, + total_files=0 + ) + + logger.info("开始解析过滤后的PR diff...") + files = DiffParser.parse_git_diff(filtered_diff_content) + logger.info(f"解析到 {len(files)} 个docs/en路径下的文件改动") + # 预过滤:仅标点/空白改动的文件不视为英文改动,跳过后续LLM分析 + filtered_files = [] + skipped_punct_files = 0 + for f in files: + if DiffParser.is_punctuation_only_change(f.diff_content): + skipped_punct_files += 1 + logger.info(f"跳过仅标点/空白改动的文件: {f.file_path}") + continue + filtered_files.append(f) + if skipped_punct_files: + logger.info(f"共有 {skipped_punct_files} 个文件因仅标点/空白改动被忽略") + + # 检查是否有文件需要分析 + if not files: + logger.warning("未找到任何文件改动") + return CommentResult( + pr_analysis=None, + file_analyses=[], + processed_files=0, + total_files=0, + error='未找到任何文件改动' + ) + + # 如果所有文件都被过滤掉了(都是标点/空白改动) + if not filtered_files: + logger.info("所有文件都是标点/空白改动,无需进行语法检查") + return CommentResult( + pr_analysis=PRAnalysisResult( + has_text_changes=False, + text_change_type="无文本改动", + has_grammar_errors=False, + grammar_errors=[], + detailed_analysis="所有改动都是标点或空白改动,无需语法检查", + suggestions=[] + ), + file_analyses=[], + processed_files=0, + total_files=len(files) + ) + + logger.info(f"开始并行处理文件分析 (共{len(filtered_files)}个,并发数{max_workers})") + file_analyses = [] + + # 计算整体超时时间 + batches = (len(filtered_files) + max_workers - 1) // max_workers + overall_timeout = SINGLE_FILE_TIMEOUT * batches + 60 + logger.info(f"整体超时: {overall_timeout}秒") + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + # 提交所有任务 + future_to_file = { + executor.submit(self.file_analysis_chain.analyze, file_info): file_info.file_path + for file_info in filtered_files + } + + completed_count = 0 + total_count = len(future_to_file) + + try: + for future in as_completed(future_to_file, timeout=overall_timeout): + file_path = future_to_file[future] + completed_count += 1 + + try: + analysis = future.result(timeout=SINGLE_FILE_TIMEOUT) + if analysis: + file_analyses.append(analysis) + logger.info(f"完成 {file_path} ({completed_count}/{total_count})") + else: + logger.warning(f"失败 {file_path} ({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError): + logger.error(f"超时 {file_path} ({completed_count}/{total_count})") + future.cancel() + except Exception as e: + logger.error(f"异常 {file_path}: {e} ({completed_count}/{total_count})") + + except (FutureTimeoutError, TimeoutError): + logger.error(f"整体超时({overall_timeout}秒),已完成{completed_count}/{total_count}") + # 取消未完成的任务 + for future in future_to_file: + if not future.done(): + future.cancel() + + logger.info(f"成功生成 {len(file_analyses)} 个文件的文本分析") + logger.info("开始生成PR整体分析...") + pr_analysis = None + if file_analyses: + logger.info(f"基于 {len(file_analyses)} 个成功处理的文件生成PR分析...") + try: + pr_analysis = self.pr_analysis_chain.generate(file_analyses, TOTAL_COMMENT_TIMEOUT) + if pr_analysis: + logger.info("PR整体分析生成成功") + else: + logger.warning("PR整体分析生成失败") + except Exception as e: + logger.error(f"生成PR分析时发生未预期的错误: {e}") + else: + logger.warning("没有成功处理的文件,跳过PR分析生成") + + return CommentResult( + pr_analysis=pr_analysis, + file_analyses=file_analyses, + processed_files=len(file_analyses), + total_files=len(files) + ) + +# ==================== 主函数 ==================== + +def get_comment_analysis(sample_diff, siliconflow_api_key="", siliconflow_api_base="https://api.siliconflow.cn/v1"): + + analyzer = PRCommentAnalyzer(siliconflow_api_key, siliconflow_api_base) + result = None + try: + result = 
analyzer.analyze_pr_diff(sample_diff) + finally: + # 确保在函数退出前清理资源 + analyzer.cleanup() + + if not result: + print("处理失败,无法获取结果") + return None + + if result.error: + print(f"错误: {result.error}") + + print("\n=== 单文件文本分析 ===") + # 只输出有语法问题的文件 + problematic_files = [f for f in result.file_analyses + if f.grammar_issues and len(f.grammar_issues) > 0] + if problematic_files: + for analysis in problematic_files: + print(f"文件: {analysis.file_path}") + print(f"语法问题: {analysis.grammar_issues}") + print(f"分析详情: {analysis.analysis_details}") + print("-" * 50) + else: + print("所有文件语法正确,无需关注") + + print("=== 处理结果 ===") + print(f"总文件数: {result.total_files}") + print(f"成功处理文件数: {result.processed_files}") + + if result.pr_analysis: + print("\n=== PR整体分析 ===") + pr = result.pr_analysis + print(f"涉及文本变更: {pr.has_text_changes}") + print(f"文本变更类型: {pr.text_change_type}") + print(f"存在语法错误: {pr.has_grammar_errors}") + print(f"语法错误列表: {pr.grammar_errors}") + print(f"详细分析: {pr.detailed_analysis}") + print(f"改进建议: {pr.suggestions}") + + + return result + +if __name__ == "__main__": + # 微服务接口逻辑: 传递进来的就是 sample_diff 的内容 + sample_diff = sys.argv[1] + result = get_comment_analysis(sample_diff) + print(result) diff --git a/ci/tools/comment/create_comment.py b/ci/tools/comment/create_comment.py index 481c7a2f..8c450419 100644 --- a/ci/tools/comment/create_comment.py +++ b/ci/tools/comment/create_comment.py @@ -22,7 +22,6 @@ class Org: comment_target_owner: str comment_target_repo: str auto_comment_enabled: bool = field(default=True) - confidence_threshold: float = field(default=0.7) text_check_enabled: bool = field(default=True) grammar_check_enabled: bool = field(default=True) @@ -142,94 +141,33 @@ def generate_comment_content(comment_result, pr_url: str, analysis_status: str = comment_body = "" # 根据分析状态添加不同的状态标识 - if analysis_status == "error": - comment_body += "### 分析状态:处理失败\n" - comment_body += "**分析过程中发生错误,无法生成详细反馈。请手动审查文本变更。**\n\n" - elif analysis_status == "low_confidence": - comment_body += "### 分析状态:置信度较低\n" - comment_body += "**当前分析置信度较低,结果仅供参考。建议进行人工审查。**\n\n" - elif analysis_status == "no_text_changes": + if analysis_status in ["no_text_changes", "no_grammar_errors"]: + # 统一的默认评论内容 comment_body += "### 分析状态:无文本问题\n" comment_body += "**AI分析结果显示本次PR未发现明显的文本变更或语法问题。无需改动。**\n\n" - elif analysis_status == "no_grammar_errors": - comment_body += "### 分析状态:文本质量良好\n" - comment_body += "**检测到文本变更,但未发现明显的语法错误,文本质量良好。无需改动。**\n\n" else: # success with issues comment_body += "### 分析状态:发现需要关注的问题\n" comment_body += "**AI分析发现了一些文本变更或语法问题,请查看下方详细信息。**\n\n" - # 如果有分析结果,添加详细信息 - if comment_result and not comment_result.error: - # 如果有PR整体分析 - if comment_result.pr_analysis: - pr_analysis = comment_result.pr_analysis - - # 添加整体评估摘要 - comment_body += "## 整体评估\n" - comment_body += f"- 涉及文本变更: {'是' if pr_analysis.has_text_changes else '否'}\n" - comment_body += f"- 文本变更类型: {pr_analysis.text_change_type}\n" - comment_body += f"- 存在语法错误: {'是' if pr_analysis.has_grammar_errors else '否'}\n\n" - - # 添加详细分析 - if pr_analysis.detailed_analysis: - comment_body += "## 详细分析\n" - comment_body += f"{pr_analysis.detailed_analysis}\n\n" - - # 添加语法错误列表 - if pr_analysis.grammar_errors: - comment_body += "## 语法问题\n" - for i, error in enumerate(pr_analysis.grammar_errors, 1): - comment_body += f"{i}. {error}\n" - comment_body += "\n" - - # 添加改进建议 - if pr_analysis.suggestions: - comment_body += "## 改进建议\n" - for i, suggestion in enumerate(pr_analysis.suggestions, 1): - comment_body += f"{i}. 
{suggestion}\n" - comment_body += "\n" - - # 添加文件级别的分析结果 + # 如果有分析结果,添加文件级别的分析结果(仅输出有语法问题的文件) + if comment_result and analysis_status not in ["no_text_changes", "no_grammar_errors"]: if comment_result.file_analyses: - # comment_body += "## 文件分析\n" - - # 统计有问题的文件 - files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues] - files_without_issues = [f for f in comment_result.file_analyses if not f.has_text_changes and not f.grammar_issues] + # 只统计有语法问题的文件 + files_with_issues = [f for f in comment_result.file_analyses + if f.grammar_issues and len(f.grammar_issues) > 0] if files_with_issues: comment_body += f"### 需要关注的文件 ({len(files_with_issues)} 个)\n" for i, file_analysis in enumerate(files_with_issues, 1): comment_body += f"\n**{i}. {file_analysis.file_path}**\n" - - if file_analysis.has_text_changes: - comment_body += f"- 文本变更: 检测到英文文本改动\n" - if file_analysis.text_lines: - comment_body += f"- 涉及行数: {len(file_analysis.text_lines)} 行\n" - - if file_analysis.grammar_issues: - comment_body += f"- 语法问题: 发现 {len(file_analysis.grammar_issues)} 个问题\n" - for j, issue in enumerate(file_analysis.grammar_issues, 1): - comment_body += f" {j}. {issue}\n" - - if file_analysis.analysis_details: - comment_body += f"- 分析详情: {file_analysis.analysis_details}\n" - - if files_without_issues: - comment_body += f"\n### 无问题的文件 ({len(files_without_issues)} 个)\n" - for file_analysis in files_without_issues: - comment_body += f"- {file_analysis.file_path}\n" - - # 添加处理统计 - # comment_body += f"\n### 处理统计\n" - # comment_body += f"- 总文件数: {comment_result.total_files}\n" - # comment_body += f"- 成功分析: {comment_result.processed_files}\n" - # comment_body += f"- 有文本变更: {len([f for f in comment_result.file_analyses if f.has_text_changes])}\n" - # comment_body += f"- 有语法问题: {len([f for f in comment_result.file_analyses if f.grammar_issues])}\n" + comment_body += f"- 语法问题: 发现 {len(file_analysis.grammar_issues)} 个问题\n" + for j, issue in enumerate(file_analysis.grammar_issues, 1): + comment_body += f" {j}. 
{issue}\n" # 添加免责声明 comment_body += "## 免责声明\n" comment_body += "本评论内容基于AI Agent技术自动生成,仅供参考。请开发者根据实际情况进行判断和修改。\n" + comment_body += "**注意:语法检查仅针对 docs/en/ 路径下的文件进行。**\n" return comment_body @@ -270,38 +208,53 @@ def create_comment_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, p logger.info(f"组织 {org_item.org_name} 未启用自动评论功能") continue - # 移除文件筛选逻辑,对所有PR平等处理 - logger.info("开始对PR进行全面文本分析(不限制文件类型和路径)") - # 获取diff内容 diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) if diff_content is None: logger.error("无法获取PR的diff内容") sys.exit(1) + # 早期检查:查看diff中是否包含docs/en路径下的文件变更 + if 'docs/en/' not in diff_content: + logger.info("diff内容中不包含docs/en路径下的文件变更,发布默认评论") + # 发布默认评论 + comment_content = generate_comment_content( + comment_result=None, + pr_url=pr_html_url, + analysis_status="no_text_changes" + ) + success = cli.add_pr_comment(pr_owner, pr_repo, pr_number, comment_content) + if success: + logger.info("默认评论发布成功(无docs/en路径变更)") + else: + logger.error("默认评论发布失败(无docs/en路径变更)") + continue + + logger.info("检测到docs/en路径下的文件变更,开始进行英文语法检查分析...") + # 调用AI Agent进行分析 logger.info("开始进行AI代码审查分析...") - comment_result = get_comment_analysis(diff_content, siliconflow_api_key, siliconflow_api_base) + comment_result = None + + try: + comment_result = get_comment_analysis(diff_content, siliconflow_api_key, siliconflow_api_base) + except Exception as e: + logger.error(f"AI分析过程发生异常: {e}") + logger.error("AI分析失败,跳过评论发布") + continue # 跳过本次评论,不发布任何内容 if not comment_result: - logger.error("AI分析失败,将发布错误状态评论") - # 创建一个错误结果对象,确保能发布评论 - from comment_agent import CommentResult - comment_result = CommentResult( - pr_analysis=None, - file_analyses=[], - processed_files=0, - total_files=0, - error="AI分析过程失败" - ) + logger.error("AI分析失败,未返回结果,跳过评论发布") + continue # 跳过本次评论,不发布任何内容 + + if comment_result.error: + logger.error(f"AI分析返回错误: {comment_result.error},跳过评论发布") + continue # 跳过本次评论,不发布任何内容 # 确定分析状态和评论内容 analysis_status = "success" - if comment_result.error: - analysis_status = "error" - logger.info("AI分析过程出错,将发布错误状态评论") - elif comment_result.pr_analysis: + if comment_result.pr_analysis: pr_analysis = comment_result.pr_analysis # 检查是否有文本变更或语法错误 @@ -318,14 +271,17 @@ def create_comment_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, p analysis_status = "success" logger.info("检测到需要关注的问题,将发布详细分析评论") else: - # 如果没有整体分析,检查是否有文件级别的问题 - files_with_issues = [f for f in comment_result.file_analyses if f.has_text_changes or f.grammar_issues] + # 如果没有整体分析,检查是否有文件级别的语法问题 + files_with_issues = [ + f for f in comment_result.file_analyses + if f.grammar_issues and len(f.grammar_issues) > 0 + ] if files_with_issues: analysis_status = "success" - logger.info(f"检测到 {len(files_with_issues)} 个文件有文本问题,将发布文件级别问题评论") + logger.info(f"检测到 {len(files_with_issues)} 个文件有语法问题,将发布文件级别问题评论") else: analysis_status = "no_text_changes" - logger.info("未检测到文件级别问题,将发布无问题评论") + logger.info("未检测到文件级别语法问题,将发布无问题评论") # 总是生成和发布评论 comment_content = generate_comment_content( @@ -349,7 +305,9 @@ def main(): parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') parser.add_argument('--pr_number', type=str, required=True, help='the PR number') parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow') - parser.add_argument('--siliconflow_api_base', type=str, default="https://api.siliconflow.cn/v1", help='the base URL of siliconflow') + parser.add_argument('--siliconflow_api_base', type=str, + default="https://api.siliconflow.cn/v1", + help='the base URL of 
siliconflow') args = Args() parser.parse_args(args=sys.argv[1:], namespace=args) args.validate() @@ -365,7 +323,9 @@ def main(): pr_number = args.pr_number siliconflow_api_key = args.siliconflow_api_key siliconflow_api_base = args.siliconflow_api_base - create_comment_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base) + create_comment_based_on_pr_diff_and_config( + conf, cli, pr_owner, pr_repo, pr_number, siliconflow_api_key, siliconflow_api_base + ) if __name__ == '__main__': diff --git a/ci/tools/comment/create_comment.yaml b/ci/tools/comment/create_comment.yaml index 32ac269a..4c77c5ed 100644 --- a/ci/tools/comment/create_comment.yaml +++ b/ci/tools/comment/create_comment.yaml @@ -7,7 +7,7 @@ comment_agent: # Model Configuration model: - name: "Qwen/Qwen3-8B" + name: "Qwen/Qwen3-32B" temperature: 0.1 max_retry: 5 # For siliconflow backend -- Gitee