diff --git a/ci/tools/translation/new_create_translation_issue.py b/ci/tools/translation/new_create_translation_issue.py index 5ca3cc0d26d3ce3804bfa9d01c1dfe67b53d40f3..f5e3f8f8af6d1738550b48fd6022afabc5f13f5c 100644 --- a/ci/tools/translation/new_create_translation_issue.py +++ b/ci/tools/translation/new_create_translation_issue.py @@ -1,340 +1,340 @@ -import argparse -import json -import logging -import re -import sys -from dataclasses import dataclass, field -from difflib import SequenceMatcher -from typing import TypeVar, Generic - -import requests -import yaml - -logging.basicConfig(level=logging.INFO, stream=sys.stdout, - format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') -logger = logging.getLogger(__name__) - - -@dataclass -class IssueTrigger: - trigger_pr_path: str - issue_title: str - issue_assignee: str - file_extension: list[str] = field(default_factory=list) - - -@dataclass -class Org: - org_name: str - issue_of_owner: str - issue_of_repo: str - auto_create_issue: bool - issue_triggers: list[dict | IssueTrigger] = field(default_factory=list) - change_content_exclude: list[str] = field(default_factory=list) - - def __post_init__(self): - tmp_issue_triggers: list[IssueTrigger] = [] - for item in self.issue_triggers: - tmp_issue_triggers.append(IssueTrigger(**item)) - self.issue_triggers = tmp_issue_triggers - - -@dataclass -class Config: - orgs: list[dict | Org] - - def __post_init__(self): - tmp_orgs: list[Org] = [] - for item in self.orgs: - tmp_orgs.append(Org(**item)) - self.orgs = tmp_orgs - - -@dataclass -class ReqArgs: - method: str - url: str - headers: dict[str, str] - params: dict[str, str] | None = field(default=None) - data: str | None = field(default=None) - timeout: int = field(default=180) - - -T = TypeVar('T') -content_type_is_text = "text/plain" -content_type_is_json_dict = {} -content_type_is_json_list = [] - - -def send_request(args: ReqArgs, t: Generic[T]) -> T: - error_count = 0 - while error_count < 
3: - try: - resp = requests.request(**args.__dict__) - resp.raise_for_status() - if type(t) is dict or type(t) is list: - res_data: dict | list = resp.json() - else: - res_data: str = resp.text - except requests.exceptions.RequestException as e: - if e.response.status_code in [400, 401, 403, 404, 405]: - logger.error("[ERROR] client error {}".format(e)) - break - logger.error("[ERROR] server error: {}".format(e)) - error_count += 1 - else: - logger.info("[OK] [{}], {}".format(args.method, args.url)) - return res_data - return None - - -class GiteeClient: - """ - Gitee OpenAPI 客户端 - """ - headers = { - "Content-Type": "application/json", - "Accept": "application/json", - } - - def __init__(self, developer_token: str): - """ - 构造函数 - :param developer_token: Gitee v5 token - """ - self.headers["Authorization"] = "Bearer {}".format(developer_token) - - def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: - req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) - req_args = ReqArgs(method="GET", url=req_url, headers=self.headers) - result: str | None = send_request(req_args, "") - if result is None: - logger.error("can not get diff file from PR: {}".format(req_url)) - return result - - def check_issue_exists(self, owner: str, repo: str, issue_titles: list[str]) -> tuple[list[str], list[str]]: - req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) - page = 1 - existed_issues = [] - while page <= 200: - query = { - "per_page": 100, - "page": page, - "sort": "created", - "direction": "desc", - } - req_args = ReqArgs(method="GET", url=req_url, params=query, headers=self.headers) - result: list | None = send_request(req_args, []) - if result is None: - break - page += 1 - for item in result: - if not issue_titles: - return [], existed_issues - if issue_titles and item.get('title') in issue_titles: - issue_titles.remove(item.get('title')) - existed_issues.append(item.get('html_url')) - if len(result) < 
100: - break - return issue_titles, existed_issues - - def create_issue(self, owner, repo, title, assignee, body): - req_url = "https://gitee.com/api/v5/repos/{}/issues".format(owner) - req_body = { - "repo": repo, - "title": title, - "issue_type": "翻译", - "body": body, - "assignee": assignee, - "push_events": False, - "tag_push_events": False, - "issues_events": False, - } - req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) - result: dict | None = send_request(req_args, {}) - return result is None - - def add_pr_comment(self, owner, repo, number, body): - req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) - req_body = { - "body": "### Translation Feedback \n {} ".format(body) - } - req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) - result: dict | None = send_request(req_args, {}) - return result is None - - def check_only_marks_changed(self, owner, repo, number, check_list): - diff_content = self.get_diff_content(owner, repo, number) - deleted_strs, inserted_strs = get_diff_content_list(diff_content) - if is_only_marks_changed(deleted_strs, inserted_strs, check_list): - logger.warning('Only marks changed, skip the following steps') - sys.exit(1) - logger.info('Not just only marks changed, continue creating issue') - - -def get_diff_file_list(diff_content: str) -> list[str]: - diff_files_list = [] - diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]] - for diff_file in diff_files: - if diff_file.endswith('\"'): - d = re.compile(r'/[\d\s\S]+') - diff_file = d.findall(diff_file) - diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') - diff_files_list.append(diff_file) - else: - diff_files_list.append(diff_file) - return diff_files_list - - -def get_diff_content_list(diff_content: str) -> tuple[str, str]: - pieces = diff_content.split('diff --git') - deleted_strs = '' - inserted_strs = '' - for 
piece in pieces: - start = False - for line in piece.splitlines(): - if line.startswith('@@'): - start = True - continue - if not start: - continue - if line.startswith('-'): - if len(line) == 1: - deleted_strs += '\n' - else: - deleted_strs += line[1:] - elif line.startswith('+'): - if len(line) == 1: - inserted_strs += '\n' - else: - inserted_strs += line[1:] - return deleted_strs, inserted_strs - - -def is_only_marks_changed(a, b, check_list): - s = SequenceMatcher(None, a, b) - for tag, i1, i2, j1, j2 in s.get_opcodes(): - if tag == 'equal': - continue - elif tag in ['delete', 'insert']: - return False - elif tag == 'replace': - deleted = ''.join(a[i1:i2]).strip() - inserted = ''.join(b[j1:j2]).strip() - if deleted not in check_list or inserted not in check_list: - return False - return True - - -class Args: - gitee_token: str - pr_owner: str - pr_repo: str - pr_number: int - - def validate(self): - valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number - if not valid: - logger.error("Invalid Command Arguments") - sys.exit(1) - - -def load_config_yaml(yaml_path): - with open(yaml_path, "r", encoding="utf-8") as config_in: - data = yaml.safe_load(config_in) - - if data is None: - return None - return Config(**data) - - -def create_issue_based_on_pr_diff_and_config(conf: Config, cli: GiteeClient, pr_owner: str, pr_repo: str, - pr_number: int): - pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) - for org_item in conf.orgs: - issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) - if org_item.org_name != pr_owner: - continue - if org_item.auto_create_issue: - cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude) - file_count = 0 - diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) - if diff_content is None: - sys.exit(1) - diff_files = get_diff_file_list(diff_content) - zh_file = [] - en_file = [] - need_create_issue = {} - for 
trigger in org_item.issue_triggers: - for diff_file in diff_files: - if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: - logger.info("file {} has been changed".format(diff_file)) - file_count += 1 - if "/zh" in trigger.trigger_pr_path: - need_create_issue["zh"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] - zh_file.append(diff_file.replace("zh/", "")) - elif "/en" in trigger.trigger_pr_path: - need_create_issue["en"] = [trigger.issue_assignee, - "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] - en_file.append(diff_file.replace("en/", "")) - else: - logger.warning("not a range") - changed_same_files = False - for z in zh_file: - if z in en_file: - changed_same_files = True - else: - changed_same_files = False - if file_count == 0: - logger.warning( - "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) - return - if changed_same_files: - logger.info("changed the same files in en and zh path, no need to create issue") - return - - need_create_issue_template = {} - need_create_issue_titles = [] - for issue_item in need_create_issue: - need_create_issue_titles.append(need_create_issue[issue_item][1]) - need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] - if need_create_issue_titles: - need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, - org_item.issue_of_repo, - need_create_issue_titles) - if not need_create_issue_list: - feedback_comment = "issue has already created, please go to check issue: {}".format( - existed_issue_list) - logger.info("Warning: " + feedback_comment) - cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) - for need_create_issue_item in need_create_issue_list: - cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, - 
need_create_issue_template[need_create_issue_item], - "### Related PR link \n - {}".format(pr__html_url)) - - -def main(): - parser = argparse.ArgumentParser(description='Create Gitee Webhook based on the profile') - parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') - parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') - parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') - parser.add_argument('--pr_number', type=str, required=True, help='the PR number') - args = Args() - parser.parse_args(args=sys.argv[1:], namespace=args) - args.validate() - - exec_py = sys.argv[0] - config_yaml_path = exec_py[:-2] + 'yaml' - conf = load_config_yaml(config_yaml_path) - - cli = GiteeClient(args.gitee_token) - - pr_owner = args.pr_owner - pr_repo = args.pr_repo - pr_number = args.pr_number - create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number) - - -if __name__ == '__main__': - main() +import argparse +import json +import logging +import re +import sys +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import TypeVar, Generic + +import requests +import yaml + +logging.basicConfig(level=logging.INFO, stream=sys.stdout, + format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class IssueTrigger: + trigger_pr_path: str + issue_title: str + issue_assignee: str + file_extension: list[str] = field(default_factory=list) + + +@dataclass +class Org: + org_name: str + issue_of_owner: str + issue_of_repo: str + auto_create_issue: bool + issue_triggers: list[dict | IssueTrigger] = field(default_factory=list) + change_content_exclude: list[str] = field(default_factory=list) + + def __post_init__(self): + tmp_issue_triggers: list[IssueTrigger] = [] + for item in self.issue_triggers: + tmp_issue_triggers.append(IssueTrigger(**item)) + 
self.issue_triggers = tmp_issue_triggers + + +@dataclass +class Config: + orgs: list[dict | Org] + + def __post_init__(self): + tmp_orgs: list[Org] = [] + for item in self.orgs: + tmp_orgs.append(Org(**item)) + self.orgs = tmp_orgs + + +@dataclass +class ReqArgs: + method: str + url: str + headers: dict[str, str] + params: dict[str, str] | None = field(default=None) + data: str | None = field(default=None) + timeout: int = field(default=180) + + +T = TypeVar('T') +content_type_is_text = "text/plain" +content_type_is_json_dict = {} +content_type_is_json_list = [] + + +def send_request(args: ReqArgs, t: Generic[T]) -> T: + error_count = 0 + while error_count < 3: + try: + resp = requests.request(**args.__dict__) + resp.raise_for_status() + if type(t) is dict or type(t) is list: + res_data: dict | list = resp.json() + else: + res_data: str = resp.text + except requests.exceptions.RequestException as e: + if e.response.status_code in [400, 401, 403, 404, 405]: + logger.error("[ERROR] client error {}".format(e)) + break + logger.error("[ERROR] server error: {}".format(e)) + error_count += 1 + else: + logger.info("[OK] [{}], {}".format(args.method, args.url)) + return res_data + return None + + +class GiteeClient: + """ + Gitee OpenAPI 客户端 + """ + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + } + + def __init__(self, developer_token: str): + """ + 构造函数 + :param developer_token: Gitee v5 token + """ + self.headers["Authorization"] = "Bearer {}".format(developer_token) + + def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: + req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) + req_args = ReqArgs(method="GET", url=req_url, headers=self.headers) + result: str | None = send_request(req_args, "") + if result is None: + logger.error("can not get diff file from PR: {}".format(req_url)) + return result + + def check_issue_exists(self, owner: str, repo: str, issue_titles: list[str]) -> 
tuple[list[str], list[str]]: + req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) + page = 1 + existed_issues = [] + while page <= 200: + query = { + "per_page": 100, + "page": page, + "sort": "created", + "direction": "desc", + } + req_args = ReqArgs(method="GET", url=req_url, params=query, headers=self.headers) + result: list | None = send_request(req_args, []) + if result is None: + break + page += 1 + for item in result: + if not issue_titles: + return [], existed_issues + if issue_titles and item.get('title') in issue_titles: + issue_titles.remove(item.get('title')) + existed_issues.append(item.get('html_url')) + if len(result) < 100: + break + return issue_titles, existed_issues + + def create_issue(self, owner, repo, title, assignee, body): + req_url = "https://gitee.com/api/v5/repos/{}/issues".format(owner) + req_body = { + "repo": repo, + "title": title, + "issue_type": "翻译", + "body": body, + "assignee": assignee, + "push_events": False, + "tag_push_events": False, + "issues_events": False, + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def add_pr_comment(self, owner, repo, number, body): + req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) + req_body = { + "body": "### Translation Feedback \n {} ".format(body) + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def check_only_marks_changed(self, owner, repo, number, check_list): + diff_content = self.get_diff_content(owner, repo, number) + deleted_strs, inserted_strs = get_diff_content_list(diff_content) + if is_only_marks_changed(deleted_strs, inserted_strs, check_list): + logger.warning('Only marks changed, skip the following steps') + sys.exit(1) + logger.info('Not just only 
marks changed, continue creating issue') + + +def get_diff_file_list(diff_content: str) -> list[str]: + diff_files_list = [] + diff_files = [x.split(' ')[0][2:] for x in diff_content.split('diff --git ')[1:]] + for diff_file in diff_files: + if diff_file.endswith('\"'): + d = re.compile(r'/[\d\s\S]+') + diff_file = d.findall(diff_file) + diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') + diff_files_list.append(diff_file) + else: + diff_files_list.append(diff_file) + return diff_files_list + + +def get_diff_content_list(diff_content: str) -> tuple[str, str]: + pieces = diff_content.split('diff --git') + deleted_strs = '' + inserted_strs = '' + for piece in pieces: + start = False + for line in piece.splitlines(): + if line.startswith('@@'): + start = True + continue + if not start: + continue + if line.startswith('-'): + if len(line) == 1: + deleted_strs += '\n' + else: + deleted_strs += line[1:] + elif line.startswith('+'): + if len(line) == 1: + inserted_strs += '\n' + else: + inserted_strs += line[1:] + return deleted_strs, inserted_strs + + +def is_only_marks_changed(a, b, check_list): + s = SequenceMatcher(None, a, b) + for tag, i1, i2, j1, j2 in s.get_opcodes(): + if tag == 'equal': + continue + elif tag in ['delete', 'insert']: + return False + elif tag == 'replace': + deleted = ''.join(a[i1:i2]).strip() + inserted = ''.join(b[j1:j2]).strip() + if deleted not in check_list or inserted not in check_list: + return False + return True + + +class Args: + gitee_token: str + pr_owner: str + pr_repo: str + pr_number: int + + def validate(self): + valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number + if not valid: + logger.error("Invalid Command Arguments") + sys.exit(1) + + +def load_config_yaml(yaml_path): + with open(yaml_path, "r", encoding="utf-8") as config_in: + data = yaml.safe_load(config_in) + + if data is None: + return None + return Config(**data) + + +def create_issue_based_on_pr_diff_and_config(conf: Config, cli: 
GiteeClient, pr_owner: str, pr_repo: str, + pr_number: int): + pr__html_url = "https://gitee.com/{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + for org_item in conf.orgs: + issue_title_pr_mark = "{}/{}/pulls/{}".format(pr_owner, pr_repo, pr_number) + if org_item.org_name != pr_owner: + continue + if org_item.auto_create_issue: + cli.check_only_marks_changed(pr_owner, pr_repo, pr_number, org_item.change_content_exclude) + file_count = 0 + diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number) + if diff_content is None: + sys.exit(1) + diff_files = get_diff_file_list(diff_content) + zh_file = [] + en_file = [] + need_create_issue = {} + for trigger in org_item.issue_triggers: + for diff_file in diff_files: + if diff_file.startswith(trigger.trigger_pr_path) and diff_file.split('.')[-1] in trigger.file_extension: + logger.info("file {} has been changed".format(diff_file)) + file_count += 1 + if "/zh" in trigger.trigger_pr_path: + need_create_issue["zh"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + zh_file.append(diff_file.replace("zh/", "")) + elif "/en" in trigger.trigger_pr_path: + need_create_issue["en"] = [trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark)] + en_file.append(diff_file.replace("en/", "")) + else: + logger.warning("not a range") + changed_same_files = False + for z in zh_file: + if z in en_file: + changed_same_files = True + else: + changed_same_files = False + if file_count == 0: + logger.warning( + "NOTE: https://gitee.com/{}/files change files out of translate range".format(issue_title_pr_mark)) + return + if changed_same_files: + logger.info("changed the same files in en and zh path, no need to create issue") + return + + need_create_issue_template = {} + need_create_issue_titles = [] + for issue_item in need_create_issue: + need_create_issue_titles.append(need_create_issue[issue_item][1]) + 
need_create_issue_template[need_create_issue[issue_item][1]] = need_create_issue[issue_item][0] + if need_create_issue_titles: + need_create_issue_list, existed_issue_list = cli.check_issue_exists(org_item.issue_of_owner, + org_item.issue_of_repo, + need_create_issue_titles) + if not need_create_issue_list: + feedback_comment = "issue has already created, please go to check issue: {}".format( + existed_issue_list) + logger.info("Warning: " + feedback_comment) + cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment) + for need_create_issue_item in need_create_issue_list: + cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_item, + need_create_issue_template[need_create_issue_item], + "### Related PR link \n - {}".format(pr__html_url)) + + +def main(): + parser = argparse.ArgumentParser(description='Create Gitee Webhook based on the profile') + parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token') + parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner') + parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo') + parser.add_argument('--pr_number', type=str, required=True, help='the PR number') + args = Args() + parser.parse_args(args=sys.argv[1:], namespace=args) + args.validate() + + exec_py = sys.argv[0] + config_yaml_path = exec_py[:-2] + 'yaml' + conf = load_config_yaml(config_yaml_path) + + cli = GiteeClient(args.gitee_token) + + pr_owner = args.pr_owner + pr_repo = args.pr_repo + pr_number = args.pr_number + create_issue_based_on_pr_diff_and_config(conf, cli, pr_owner, pr_repo, pr_number) + + +if __name__ == '__main__': + main() diff --git a/ci/tools/translation/new_create_translation_issue.yaml b/ci/tools/translation/new_create_translation_issue.yaml index a58ebcc6ef49761daa02f88dd8078fe12e066fd7..07e6c76d0a2c0272f9ed587e2d7cb96d414a5190 100644 --- a/ci/tools/translation/new_create_translation_issue.yaml +++ 
b/ci/tools/translation/new_create_translation_issue.yaml @@ -1,22 +1,22 @@ -orgs: - - org_name: openeuler - issue_of_owner: openeuler - issue_of_repo: globalization - auto_create_issue: true - issue_triggers: - - trigger_pr_path: 'docs/zh' - issue_title: "[Auto] This is an English translation issue for the PR" - issue_assignee: judithsq - file_extension: [ doc, md, json ] - change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] - - - org_name: src-openeuler - issue_of_owner: openeuler - issue_of_repo: globalization - auto_create_issue: true - issue_triggers: - - trigger_pr_path: 'docs/zh' - issue_title: "[Auto] This is an English translation issue for the PR" - issue_assignee: judithsq - file_extension: [ doc, md, json ] +orgs: + - org_name: openeuler + issue_of_owner: openeuler + issue_of_repo: globalization + auto_create_issue: true + issue_triggers: + - trigger_pr_path: 'docs/zh' + issue_title: "[Auto] This is an English translation issue for the PR" + issue_assignee: judithsq + file_extension: [ doc, md, json ] + change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] + + - org_name: src-openeuler + issue_of_owner: openeuler + issue_of_repo: globalization + auto_create_issue: true + issue_triggers: + - trigger_pr_path: 'docs/zh' + issue_title: "[Auto] This is an English translation issue for the PR" + issue_assignee: judithsq + file_extension: [ doc, md, json ] change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ] \ No newline at end of file diff --git a/ci/tools/translation/new_create_translation_issue_AI.py b/ci/tools/translation/new_create_translation_issue_AI.py new file mode 100755 index 0000000000000000000000000000000000000000..6ae64773192c9b52265c426f10d031d523430f9f --- /dev/null +++ b/ci/tools/translation/new_create_translation_issue_AI.py @@ -0,0 +1,627 @@ +# ==================== 常量定义 ==================== + +# Issue类型常量 +ISSUE_TYPE_TRANSLATION = "翻译" + +# 
==================== 数据模型定义 ==================== + +import argparse +import json +import logging +import re +import sys +from dataclasses import dataclass, field +from difflib import SequenceMatcher +from typing import TypeVar, Generic +from translation_agent import get_agent_summary + +import requests +import yaml + +logging.basicConfig(level=logging.INFO, stream=sys.stdout, + format='%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s') +logger = logging.getLogger(__name__) + + +@dataclass +class IssueTrigger: + trigger_pr_path: str + issue_title: str + issue_assignee: str + file_extension: list[str] = field(default_factory=list) + + +@dataclass +class Org: + org_name: str + issue_of_owner: str + issue_of_repo: str + auto_create_issue: bool + issue_triggers: list[dict | IssueTrigger] = field(default_factory=list) + change_content_exclude: list[str] = field(default_factory=list) + + def __post_init__(self): + tmp_issue_triggers: list[IssueTrigger] = [] + for item in self.issue_triggers: + tmp_issue_triggers.append(IssueTrigger(**item)) + self.issue_triggers = tmp_issue_triggers + + +@dataclass +class TranslationAgentConfig: + backend: dict = field(default_factory=dict) + model: dict = field(default_factory=dict) + processing: dict = field(default_factory=dict) + logging: dict = field(default_factory=dict) + + +@dataclass +class Config: + orgs: list[dict | Org] + translation_agent: dict | TranslationAgentConfig = field(default_factory=dict) + + def __post_init__(self): + tmp_orgs: list[Org] = [] + for item in self.orgs: + tmp_orgs.append(Org(**item)) + self.orgs = tmp_orgs + + if isinstance(self.translation_agent, dict) and self.translation_agent: + self.translation_agent = TranslationAgentConfig(**self.translation_agent) + + +@dataclass +class ReqArgs: + method: str + url: str + headers: dict[str, str] + params: dict[str, str] | None = field(default=None) + data: str | None = field(default=None) + timeout: int = field(default=180) + + +T = 
TypeVar('T') + + +def send_request(args: ReqArgs, t: Generic[T]) -> T: + error_count = 0 + while error_count < 3: + try: + resp = requests.request(**args.__dict__) + resp.raise_for_status() + if type(t) is dict or type(t) is list: + res_data: dict | list = resp.json() + else: + res_data: str = resp.text + except requests.exceptions.RequestException as e: + if e.response.status_code in [400, 401, 403, 404, 405]: + logger.error("[ERROR] client error {}".format(e)) + break + logger.error("[ERROR] server error: {}".format(e)) + error_count += 1 + else: + logger.info("[OK] [{}], {}".format(args.method, args.url)) + return res_data + return None + + +class GiteeClient: + """ + Gitee OpenAPI 客户端 + """ + headers = { + "Content-Type": "application/json", + "Accept": "application/json", + } + + def __init__(self, developer_token: str): + """ + 构造函数 + :param developer_token: Gitee v5 token + """ + self.headers["Authorization"] = "Bearer {}".format(developer_token) + + def get_diff_content(self, owner: str, repo: str, number: int) -> str | None: + req_url = "https://gitee.com/{}/{}/pulls/{}.diff".format(owner, repo, number) + req_args = ReqArgs(method="GET", url=req_url, headers=self.headers) + result: str | None = send_request(req_args, "") + if result is None: + logger.error("can not get diff file from PR: {}".format(req_url)) + return result + + def check_issue_exists(self, owner: str, repo: str, + issue_titles: list[str]) -> tuple[list[str], list[str]]: + req_url = "https://gitee.com/api/v5/repos/{}/{}/issues".format(owner, repo) + page = 1 + existed_issues = [] + while page <= 200: + query = { + "per_page": 100, + "page": page, + "sort": "created", + "direction": "desc", + } + req_args = ReqArgs(method="GET", url=req_url, params=query, headers=self.headers) + result: list | None = send_request(req_args, []) + if result is None: + break + page += 1 + for item in result: + if not issue_titles: + return [], existed_issues + if issue_titles and item.get('title') in 
issue_titles: + issue_titles.remove(item.get('title')) + existed_issues.append(item.get('html_url')) + if len(result) < 100: + break + return issue_titles, existed_issues + + def create_issue(self, owner, repo, title, assignee, body): + req_url = "https://gitee.com/api/v5/repos/{}/issues".format(owner) + req_body = { + "repo": repo, + "title": title, + "issue_type": ISSUE_TYPE_TRANSLATION, + "body": body, + "assignee": assignee, + "push_events": False, + "tag_push_events": False, + "issues_events": False, + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is not None + + def add_pr_comment(self, owner, repo, number, body): + req_url = 'https://gitee.com/api/v5/repos/{}/{}/pulls/{}/comments'.format(owner, repo, number) + req_body = { + "body": "### Translation Feedback \n {} ".format(body) + } + req_args = ReqArgs(method="POST", url=req_url, headers=self.headers, data=json.dumps(req_body)) + result: dict | None = send_request(req_args, {}) + return result is None + + def check_only_marks_changed(self, owner, repo, number, check_list): + diff_content = self.get_diff_content(owner, repo, number) + + # 检查docs/en路径下是否有对应的文件变更 + zh_files_in_en = check_zh_files_also_modified_in_en(diff_content) + + # 只检查docs/zh路径下的变更,过滤掉同时在en下修改的文件 + filtered_diff_content = filter_docs_zh_files(diff_content, zh_files_in_en) + if not filtered_diff_content.strip(): + logger.info('No docs/zh changes found, skip mark change check') + return + + deleted_strs, inserted_strs = get_diff_content_list(filtered_diff_content) + if is_only_marks_changed(deleted_strs, inserted_strs, check_list): + logger.warning('Only marks changed in docs/zh files, skip the following steps') + sys.exit(1) + logger.info('Not just only marks changed in docs/zh files, continue creating issue') + + +def get_diff_file_list(diff_content: str) -> list[str]: + diff_files_list = [] + diff_files = [x.split(' 
')[0][2:] for x in diff_content.split('diff --git ')[1:]] + for diff_file in diff_files: + if diff_file.endswith('\"'): + d = re.compile(r'/[\d\s\S]+') + diff_file = d.findall(diff_file) + diff_file = diff_file[0].replace('/', '', 1).replace('\"', '') + diff_files_list.append(diff_file) + else: + diff_files_list.append(diff_file) + return diff_files_list + + +def get_diff_content_list(diff_content: str) -> tuple[str, str]: + pieces = diff_content.split('diff --git') + deleted_strs = '' + inserted_strs = '' + for piece in pieces: + start = False + for line in piece.splitlines(): + if line.startswith('@@'): + start = True + continue + if not start: + continue + if line.startswith('-'): + if len(line) == 1: + deleted_strs += '\n' + else: + deleted_strs += line[1:] + elif line.startswith('+'): + if len(line) == 1: + inserted_strs += '\n' + else: + inserted_strs += line[1:] + return deleted_strs, inserted_strs + + +def is_only_marks_changed(a, b, check_list): + s = SequenceMatcher(None, a, b) + for tag, i1, i2, j1, j2 in s.get_opcodes(): + if tag == 'equal': + continue + elif tag in ['delete', 'insert']: + return False + elif tag == 'replace': + deleted = ''.join(a[i1:i2]).strip() + inserted = ''.join(b[j1:j2]).strip() + if deleted not in check_list or inserted not in check_list: + return False + return True + + +class Args: + gitee_token: str + pr_owner: str + pr_repo: str + pr_number: int + siliconflow_api_key: str = "" + siliconflow_api_base: str = "https://api.siliconflow.cn/v1" + + def validate(self): + valid = self.gitee_token and self.pr_owner and self.pr_repo and self.pr_number + if not valid: + logger.error("Invalid Command Arguments") + sys.exit(1) + + +def load_config_yaml(yaml_path): + with open(yaml_path, "r", encoding="utf-8") as config_in: + data = yaml.safe_load(config_in) + + if data is None: + return None + return Config(**data) + + +def analyze_diff_files(diff_files: list[str], issue_triggers: list[IssueTrigger], + issue_title_pr_mark: str) -> 
tuple[int, list[str], dict]: + """ + 分析diff文件,识别需要创建issue的文件(只处理docs/zh路径下的文件,不包括同时在docs/en下修改的文件) + 返回: (文件计数, 中文文件列表, 需要创建的issue字典) + """ + file_count = 0 + zh_file = [] + need_create_issue = {} + + for trigger in issue_triggers: + for diff_file in diff_files: + # 只处理docs/zh路径下的文件 + if not diff_file.startswith('docs/zh/'): + continue + + if diff_file.startswith(trigger.trigger_pr_path) and \ + diff_file.split('.')[-1] in trigger.file_extension: + logger.info("file {} has been changed".format(diff_file)) + file_count += 1 + if "/zh" in trigger.trigger_pr_path: + need_create_issue["zh"] = [ + trigger.issue_assignee, + "{}({}).".format(trigger.issue_title, issue_title_pr_mark) + ] + # 提取相对于docs/zh/的路径 + relative_path = diff_file.replace("docs/zh/", "") + zh_file.append(relative_path) + + return file_count, zh_file, need_create_issue + + +def check_zh_files_also_modified_in_en(diff_content: str) -> list[str]: + """ + 检查哪些docs/zh文件在docs/en下也有修改 + 返回:同时在docs/zh和docs/en下修改的文件列表(相对于docs/zh/的路径) + """ + if not diff_content: + return [] + + # 获取所有diff文件 + all_diff_files = get_diff_file_list(diff_content) + + # 获取docs/zh和docs/en下的文件 + zh_files = [f.replace("docs/zh/", "") for f in all_diff_files if f.startswith("docs/zh/")] + en_files = [f.replace("docs/en/", "") for f in all_diff_files if f.startswith("docs/en/")] + + # 找出同时在zh和en下修改的文件 + zh_files_in_en = [] + for zh_file in zh_files: + if zh_file in en_files: + zh_files_in_en.append(zh_file) + logger.info(f"文件 {zh_file} 在docs/zh和docs/en下都有修改,将跳过摘要生成") + + return zh_files_in_en + + +def prepare_issue_templates(need_create_issue: dict) -> tuple[dict, list[str]]: + """ + 准备issue模板和标题列表 + """ + need_create_issue_template = {} + need_create_issue_titles = [] + for issue_item in need_create_issue: + need_create_issue_titles.append(need_create_issue[issue_item][1]) + need_create_issue_template[need_create_issue[issue_item][1]] = \ + need_create_issue[issue_item][0] + return need_create_issue_template, need_create_issue_titles + 
def generate_issue_body(issue_summary, diff_files: list[str], pr_html_url: str) -> str:
    """
    Render the Markdown body of a translation issue.

    When ``issue_summary`` is truthy and its ``error`` attribute is falsy, the
    body contains the statistics, the overall summary and the per-file details
    read from it. Otherwise a fallback body listing ``diff_files`` is produced.
    A disclaimer and the PR link are always appended.

    :param issue_summary: summary object exposing ``error``, ``total_files``,
        ``processed_files``, ``total_summary`` and ``file_summaries``; may be None
    :param diff_files: changed file paths, used only by the fallback body
    :param pr_html_url: URL of the pull request that triggered the issue
    :return: the complete issue body as a single string
    """
    # Collect fragments and join once instead of growing a string with `+=`.
    parts: list[str] = []

    if issue_summary and not issue_summary.error:
        parts.append("## 📊 变更统计\n\n")
        parts.append(f"- **总文件数**: {issue_summary.total_files}\n")
        parts.append(f"- **成功处理文件数**: {issue_summary.processed_files}\n")
        if issue_summary.total_files != issue_summary.processed_files:
            # Some files were not summarised: ask for a manual review.
            unprocessed = issue_summary.total_files - issue_summary.processed_files
            parts.append(f"- **未处理文件数**: {unprocessed}\n")
            parts.append("- **提醒:机器人未能及时自动生成所有改动的摘要,"
                         "请注意人工审查!**\n")

        total = issue_summary.total_summary
        if total:
            parts.append(f"- **总改动行数**: {total.total_lines_changed}\n")
            parts.append(f"- **改动类型**: {', '.join(total.change_type_list)}\n\n")
            parts.append("## 🔍 整体变更摘要\n\n")
            parts.append(f"{total.overall_summary}\n\n")
            parts.append("## ⚠️ 整体潜在影响\n\n")
            parts.append(f"{total.overall_potential_impact}\n\n")

        if issue_summary.file_summaries:
            parts.append("## 📝 单文件变更详情\n\n")
            for summary in issue_summary.file_summaries:
                parts.append(f"### 📁 {summary.file_path}\n\n")
                parts.append(f"- **改动类型**: {summary.change_type}\n")
                parts.append(f"- **新增行数**: {summary.lines_added}\n")
                parts.append(f"- **删除行数**: {summary.lines_deleted}\n")
                parts.append(f"- **潜在影响**: {summary.potential_impact}\n")
                parts.append(f"- **详细摘要**: {summary.summary}\n\n")
                # Horizontal rule between consecutive file sections.
                parts.append("---\n\n")
    else:
        # No usable summary: fall back to a plain list of the changed files.
        parts.append("## ⚠️ 翻译变更检测\n\n")
        parts.append("检测到需要翻译的文件变更,但无法获取详细摘要信息。\n\n")
        parts.append(f"**变更文件数量**: {len(diff_files)}\n")
        parts.append(f"**相关PR**: {pr_html_url}\n\n")
        parts.append("## 📝 变更文件列表\n\n")
        parts.extend(f"- {file_path}\n" for file_path in diff_files)
        parts.append("\n")

    parts.append("## ❗️ 本Issue的摘要内容基于AI Agent技术自动生成,"
                 "仅供参考,请以实际更改为准。\n\n")
    parts.append("## 🔗 相关PR链接\n\n")
    parts.append(f"- {pr_html_url}\n")

    return "".join(parts)
def filter_docs_zh_files(diff_content: str, exclude_files: list[str] = None) -> str:
    """
    Keep only the per-file sections of a diff that belong to ``docs/zh/``.

    The diff is split into sections at every ``diff --git`` header line. A
    section is kept when the path taken from its header starts with
    ``docs/zh/`` and its path relative to ``docs/zh/`` is not excluded.

    :param diff_content: raw diff text; empty or None yields ``""``
    :param exclude_files: paths relative to ``docs/zh/`` whose sections must be dropped
    :return: the kept sections joined with newlines (``""`` when none is kept)
    """
    if not diff_content:
        return ""

    zh_prefix = "docs/zh/"
    # A set gives O(1) membership tests however many files are excluded.
    excluded = set(exclude_files or ())
    kept_lines: list[str] = []

    def flush(path: str, section: list[str]) -> None:
        """Append a finished section unless it is non-zh or excluded."""
        if not path:
            return
        # Strip the leading prefix only; `replace` would also eat a later
        # "docs/zh/" inside the path and match the wrong exclusion entry.
        if path.removeprefix(zh_prefix) in excluded:
            logger.info(f"排除docs/zh路径下的文件(因为在en下也有修改): {path}")
            return
        kept_lines.extend(section)
        logger.info(f"包含docs/zh路径下的文件: {path}")

    current_section: list[str] = []
    current_zh_path = ""  # non-empty only while inside a docs/zh section

    for line in diff_content.split('\n'):
        if not line.startswith('diff --git'):
            # Body line of the current section. Lines before the first header
            # are collected too but never flushed.
            current_section.append(line)
            continue

        # A new header closes the previous section.
        flush(current_zh_path, current_section)
        current_section = [line]
        current_zh_path = ""

        # The path is the text between " a/" and the first " b/".
        a_pos = line.find(' a/')
        b_pos = line.find(' b/')
        if a_pos != -1 and b_pos != -1 and a_pos < b_pos:
            candidate = line[a_pos + 3:b_pos]
            if candidate.startswith(zh_prefix):
                current_zh_path = candidate

    # The last section has no following header to close it.
    flush(current_zh_path, current_section)

    return '\n'.join(kept_lines)
def process_org_item(org_item: Org, cli: GiteeClient, pr_owner: str, pr_repo: str,
                     pr_number: int, siliconflow_api_key: str, siliconflow_api_base: str,
                     pr_html_url: str, issue_title_pr_mark: str,
                     translation_agent_config: TranslationAgentConfig = None):
    """
    Create the translation issues required by one organisation entry for a PR.

    Steps: fetch the PR diff, keep the ``docs/zh`` sections that were not also
    changed under ``docs/en``, match the remaining files against the entry's
    issue triggers, skip titles that already exist, then create the missing
    issues. Exits the process with status 1 when the diff cannot be fetched.

    :param org_item: organisation entry (issue repo and triggers)
    :param cli: Gitee client used to read the diff and write issues/comments
    :param pr_owner: owner of the PR repository
    :param pr_repo: name of the PR repository
    :param pr_number: PR number
    :param siliconflow_api_key: API key forwarded to the summary agent
    :param siliconflow_api_base: API base URL forwarded to the summary agent
    :param pr_html_url: URL of the PR, embedded in the issue body
    :param issue_title_pr_mark: ``owner/repo/pulls/number`` marker appended to titles
    :param translation_agent_config: optional agent settings; defaults apply when None
    """
    diff_content = cli.get_diff_content(pr_owner, pr_repo, pr_number)
    if diff_content is None:
        sys.exit(1)

    # Cheap early exit: nothing under docs/zh is mentioned at all.
    if 'docs/zh/' not in diff_content:
        logger.info("diff内容中不包含docs/zh路径下的文件变更,无需创建翻译issue")
        return

    # Files changed under both docs/zh and docs/en are left out of the diff below.
    zh_files_in_en = check_zh_files_also_modified_in_en(diff_content)
    if zh_files_in_en:
        logger.info(f"发现 {len(zh_files_in_en)} 个在docs/zh和docs/en下同时修改的文件:{zh_files_in_en}")
    else:
        logger.info("没有发现同时在docs/zh和docs/en下修改的文件")

    filtered_diff_content = filter_docs_zh_files(diff_content, zh_files_in_en)
    if not filtered_diff_content.strip():
        logger.info("没有需要处理的docs/zh路径下的文件变更,无需创建翻译issue")
        return

    diff_files = get_diff_file_list(filtered_diff_content)
    logger.info(f"解析出 {len(diff_files)} 个变更文件:{diff_files}")

    file_count, zh_file, need_create_issue = analyze_diff_files(
        diff_files, org_item.issue_triggers, issue_title_pr_mark)
    logger.info(f"分析完成:共找到 {file_count} 个需要处理的文件")

    if file_count == 0:
        logger.warning(
            "NOTE: https://gitee.com/{}/files change files out of translate range"
            .format(issue_title_pr_mark))
        return

    need_create_issue_template, need_create_issue_titles = prepare_issue_templates(need_create_issue)
    if not need_create_issue_titles:
        return

    need_create_issue_list, existed_issue_list = cli.check_issue_exists(
        org_item.issue_of_owner, org_item.issue_of_repo, need_create_issue_titles)

    if not need_create_issue_list:
        feedback_comment = "所有相关的翻译issue已经存在,请检查: {}".format(
            ", ".join(existed_issue_list))
        logger.info("Warning: " + feedback_comment)
        cli.add_pr_comment(pr_owner, pr_repo, pr_number, feedback_comment)
        return

    # The body depends only on the filtered diff and the agent settings, so it
    # is built once here rather than once per issue: every extra build would
    # repeat the whole summary run for an identical result.
    backend_config = translation_agent_config.backend if translation_agent_config else {}
    model_config = translation_agent_config.model if translation_agent_config else {}
    processing_config = translation_agent_config.processing if translation_agent_config else {}

    try:
        issue_summary = get_agent_summary(
            filtered_diff_content, siliconflow_api_key, siliconflow_api_base,
            model_name=model_config.get('name', 'Qwen/Qwen3-8B'),
            backend_type=backend_config.get('type', 'siliconflow'),
            temperature=model_config.get('temperature', 0.1),
            max_workers=processing_config.get('max_workers', 8),
            single_file_timeout=processing_config.get('single_file_timeout', 180),
            total_summary_timeout=processing_config.get('total_summary_timeout', 300),
            max_retry=model_config.get('max_retry', 5),
            max_retry_ollama=model_config.get('max_retry_ollama', 1)
        )
        issue_body = generate_issue_body(issue_summary, diff_files, pr_html_url)
        logger.info("AI Agent成功生成issue内容")
    except Exception as e:
        # Deliberate best-effort: a failing summary must not block issue
        # creation, so fall back to the plain body.
        logger.error(f"AI Agent调用失败: {e}")
        logger.info("回退到传统方式创建issue")
        issue_body = "### Related PR link \n - {}".format(pr_html_url)

    for issue_title in need_create_issue_list:
        success = cli.create_issue(org_item.issue_of_owner, org_item.issue_of_repo,
                                   issue_title,
                                   need_create_issue_template[issue_title], issue_body)
        if success:
            logger.info(f"成功创建issue: {issue_title}")
        else:
            logger.error(f"创建issue失败: {issue_title}")
            # Tell the PR author so the issue can be created by hand.
            error_comment = f"创建翻译issue失败: {issue_title},请手动创建"
            cli.add_pr_comment(pr_owner, pr_repo, pr_number, error_comment)
def main():
    """
    Command-line entry point.

    Parses the arguments, loads the YAML configuration that sits next to the
    script (same path with the extension replaced by ``.yaml``), and creates
    the translation issues for the given PR.
    """
    import os  # local import: only needed to derive the config path

    parser = argparse.ArgumentParser(description='Create Gitee Webhook based on the profile')
    parser.add_argument('--gitee_token', type=str, required=True, help='gitee v5 api token')
    parser.add_argument('--pr_owner', type=str, required=True, help='the PR of owner')
    parser.add_argument('--pr_repo', type=str, required=True, help='the PR of repo')
    parser.add_argument('--pr_number', type=str, required=True, help='the PR number')
    parser.add_argument('--siliconflow_api_key', type=str, default="", help='the API key of siliconflow')
    parser.add_argument('--siliconflow_api_base', type=str,
                        default="https://api.siliconflow.cn/v1",
                        help='the base URL of siliconflow')
    args = Args()
    parser.parse_args(args=sys.argv[1:], namespace=args)
    args.validate()

    # Swap the script's extension for ".yaml". Slicing off the last two
    # characters only worked for names ending in "py"; splitext also handles
    # a script invoked without an extension.
    # NOTE(review): this resolves to "<script name>.yaml" — confirm a config
    # file with exactly that name is deployed next to the script.
    config_yaml_path = os.path.splitext(sys.argv[0])[0] + '.yaml'
    conf = load_config_yaml(config_yaml_path)

    cli = GiteeClient(args.gitee_token)

    create_issue_based_on_pr_diff_and_config(
        conf, cli, args.pr_owner, args.pr_repo, int(args.pr_number),
        args.siliconflow_api_key, args.siliconflow_api_base)


if __name__ == '__main__':
    main()
# Translation Agent Configuration
translation_agent:
  # Backend Configuration
  backend:
    type: "siliconflow"  # Options: "ollama" or "siliconflow"
    # The siliconflow settings (API key, base URL) are passed on the command line
    ollama:
      base_url: "http://localhost:11434"

  # Model Configuration
  model:
    name: "Qwen/Qwen3-32B"  # Options: "llama3" "Qwen/Qwen3-8B" "THUDM/GLM-4-32B-0414" or others
    temperature: 0.1
    max_retry: 5  # For siliconflow backend
    max_retry_ollama: 1  # For ollama backend

  # Processing Configuration
  processing:
    max_workers: 8  # Number of parallel workers for file processing
    single_file_timeout: 180  # Timeout for single file summary generation (seconds)
    total_summary_timeout: 300  # Timeout for total summary generation (seconds)

  # Logging Configuration
  logging:
    level: "INFO"

# Issue Creation Configuration
orgs:
  - org_name: openeuler
    issue_of_owner: openeuler
    issue_of_repo: globalization
    auto_create_issue: true
    issue_triggers:
      - trigger_pr_path: 'docs/zh'
        issue_title: "[Auto] This is an English translation issue for the PR"
        issue_assignee: judithsq
        file_extension: [ doc, md, json ]
    # Punctuation listed as ASCII / full-width pairs. The quote entries are the
    # straight quote plus the two curly quotes, not three straight quotes.
    change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ]

  - org_name: src-openeuler
    issue_of_owner: openeuler
    issue_of_repo: globalization
    auto_create_issue: true
    issue_triggers:
      - trigger_pr_path: 'docs/zh'
        issue_title: "[Auto] This is an English translation issue for the PR"
        issue_assignee: judithsq
        file_extension: [ doc, md, json ]
    change_content_exclude: [ ',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、' ]
repositories: assign_issue: - title: "[Auto] This is an English translation issue." - sign_to: judithsq -# - trigger_pr_path: 'docs/en' -# file_extension: [doc, md, json] -# assign_issue: -# - title: "[Auto] This is a Russian translation issue." -# - sign_to: judithsq exclude: - condition: only_marks_change - check_list: [',', ',', '.', '。', ';', ';', ':', ':', '"', '“', '”', '、'] + check_list: [',', ',', '.', '。', ';', ';', ':', ':', '"', '"', '"', '、'] - owner: openeuler repo: website-v2 auto_create_issue: false diff --git a/ci/tools/translation/translation_agent.py b/ci/tools/translation/translation_agent.py new file mode 100755 index 0000000000000000000000000000000000000000..dbe89469dedb32d872fe7017d50edf22c713753a --- /dev/null +++ b/ci/tools/translation/translation_agent.py @@ -0,0 +1,1318 @@ +import json +import re +import logging +import urllib.parse +from typing import List, Dict, Any, Optional, Tuple, Literal +from dataclasses import dataclass +from concurrent.futures import ThreadPoolExecutor, as_completed, TimeoutError as FutureTimeoutError +from pathlib import Path +import tiktoken +import sys +import time +# LangChain imports +from langchain_core.prompts import ChatPromptTemplate, PromptTemplate +from langchain_core.runnables import RunnableLambda, RunnablePassthrough +from pydantic import BaseModel, Field, SecretStr +from langchain_community.llms import Ollama +from langchain_ollama import ChatOllama +from langchain.chains import TransformChain, SequentialChain +from langchain_core.output_parsers import JsonOutputParser +from langchain_openai import ChatOpenAI +import yaml + +# ==================== 配置常量 ==================== + +# 后端类型常量 +BACKEND_TYPE_OLLAMA = "ollama" +BACKEND_TYPE_SILICONFLOW = "siliconflow" + +# 默认配置值 +DEFAULT_BACKEND_TYPE = BACKEND_TYPE_SILICONFLOW +DEFAULT_OLLAMA_BASE_URL = 'http://localhost:11434' +DEFAULT_MODEL_NAME = 'Qwen/Qwen3-8B' +DEFAULT_MODEL_TEMPERATURE = 0.1 +DEFAULT_MODEL_MAX_RETRY = 5 +DEFAULT_MODEL_MAX_RETRY_OLLAMA = 
1 +DEFAULT_PROCESSING_MAX_WORKERS = 8 +DEFAULT_SINGLE_FILE_TIMEOUT = 180 +DEFAULT_TOTAL_SUMMARY_TIMEOUT = 300 +DEFAULT_LOGGING_LEVEL = 'INFO' + +# 配置日志 +logging.basicConfig(level=getattr(logging, DEFAULT_LOGGING_LEVEL.upper())) +logger = logging.getLogger(__name__) + +# ==================== 数据模型定义 ==================== + +class SingleFileSummary(BaseModel): + """单个文件摘要的结构化输出""" + file_path: str = Field(description="文件路径", default="") + change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", + "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") + potential_impact: str = Field(description="改动对其他文件潜在的影响") + summary: str = Field(description="改动的详细摘要") + lines_added: int = Field(description="新增行数", default=0) + lines_deleted: int = Field(description="删除行数", default=0) + +class FileChangeInfo(BaseModel): + """文件改动信息""" + file_path: str = Field(description="文件路径") + change_type: Literal["仅涉及标点符号的修改", "涉及到中英文文本内容的修改", + "涉及到代码内容的修改", "涉及到其他内容的修改"] = Field(description="改动类型") + lines_changed: int = Field(description="改动行数") + +class TotalSummary(BaseModel): + """总摘要的结构化输出""" + total_files_changed: int = Field(description="总共修改的文件数量", default=0) + total_lines_changed: int = Field(description="总共修改的行数", default=0) + overall_potential_impact: str = Field(description="整体改动对其他文件潜在的影响") + overall_summary: str = Field(description="整体改动的详细摘要") + change_type_list: List[str] = Field(description="所有文件包含的改动种类列表", default=[]) + file_changes: List[FileChangeInfo] = Field(description="每个修改文件的详细信息列表", default=[]) + +@dataclass +class DiffFileInfo: + """单个文件的diff信息""" + file_path: str + diff_content: str + lines_added: int + lines_deleted: int + +@dataclass +class ProcessingResult: + """处理结果""" + file_summaries: List[SingleFileSummary] + total_summary: Optional[TotalSummary] + processed_files: int + total_files: int + error: Optional[str] = None + +# ==================== Token 统计工具 ==================== + +class TokenCounter: + def __init__(self, model_name=DEFAULT_MODEL_NAME): + 
class DiffParser:
    """Parser that splits a git diff into per-file sections."""

    @staticmethod
    def parse_git_diff(diff_content: str) -> List["DiffFileInfo"]:
        """
        Split a git diff into one ``DiffFileInfo`` per changed file.

        A section starts at a ``diff --git`` header and runs to the next one.
        Sections whose header yields no path, and lines before the first
        header, are dropped.

        Args:
            diff_content: raw git diff text

        Returns:
            One entry per section, in diff order.
        """
        files = []
        current_file = None
        current_diff: List[str] = []

        def flush() -> None:
            """Store the section collected so far, if there is one."""
            if current_file and current_diff:
                diff_info = DiffParser._create_diff_file_info(current_file, current_diff)
                if diff_info:
                    files.append(diff_info)

        for line in diff_content.strip().split('\n'):
            if line.startswith('diff --git'):
                flush()
                current_file = DiffParser._extract_file_path(line)
                # Without a usable path the lines that follow are skipped.
                current_diff = [line] if current_file else []
            elif current_file:
                current_diff.append(line)

        # The last section has no following header to close it.
        flush()
        return files

    @staticmethod
    def _extract_file_path(diff_line: str) -> Optional[str]:
        """
        Extract the old ("a/") file path from a ``diff --git`` header line.

        Handles both the plain form ``diff --git a/path b/path`` and the
        quoted form ``diff --git "a/path" "b/path"``.

        Args:
            diff_line: the ``diff --git`` header line

        Returns:
            The decoded path, or None when the line cannot be parsed.
        """
        try:
            raw_path = DiffParser._raw_old_path(diff_line)
            if raw_path is None:
                return None
            return DiffParser._decode_file_path(raw_path)
        except Exception as e:
            # Best effort: an unparseable header must not abort the whole diff.
            logger.warning(f"解析文件路径时发生错误: {e}, diff行: {diff_line}")
            return None

    @staticmethod
    def _raw_old_path(diff_line: str) -> Optional[str]:
        """Return the still-encoded "a/" path of a header line, or None."""
        # 1. Quoted form: diff --git "a/path" "b/path"
        quoted_match = re.match(r'diff --git "a/(.+?)" "b/(.+?)"', diff_line)
        if quoted_match:
            return quoted_match.group(1)

        # 2. Unquoted header with identical old and new path: "P b/P". Cutting
        #    at the exact midpoint stays correct when P itself contains " b/",
        #    where a search for the first " b/" would cut too early.
        prefix = 'diff --git a/'
        if diff_line.startswith(prefix):
            rest = diff_line[len(prefix):]
            half, odd = divmod(len(rest) - 3, 2)
            if (not odd and half > 0 and rest[half:half + 3] == ' b/'
                    and rest[:half] == rest[half + 3:]):
                return rest[:half]

        # 3. General unquoted form (also covers renames, where the paths differ).
        match = re.match(r'diff --git a/(.+?) b/(.+?)(?:\s|$)', diff_line)
        if match:
            return match.group(1)

        # 4. Looser scan for " a/" ... " b/" anywhere in the line.
        if ' a/' in diff_line and ' b/' in diff_line:
            a_pos = diff_line.find(' a/')
            b_pos = diff_line.find(' b/')
            if a_pos < b_pos:
                return diff_line[a_pos + 3:b_pos]
            return None

        # 5. Last resort: third whitespace-separated token.
        parts = diff_line.split()
        if len(parts) >= 3 and parts[2].startswith('a/'):
            return parts[2][2:]
        return None

    @staticmethod
    def _decode_file_path(file_path: str) -> str:
        """
        Decode a path as it appears in a diff header.

        Applies, in order: percent-decoding, removal of surrounding double
        quotes together with the ``\\"`` and ``\\\\`` escapes, and decoding of
        octal escapes such as ``\\344\\270\\255``.

        Args:
            file_path: path text taken from the header

        Returns:
            The decoded path, or ``file_path`` unchanged when decoding fails.
        """
        try:
            # NOTE(review): percent-decoding would also alter a real file name
            # that contains a literal "%xx" — confirm the diff source encodes
            # paths this way.
            decoded_path = urllib.parse.unquote(file_path, encoding='utf-8')

            if decoded_path.startswith('"') and decoded_path.endswith('"'):
                decoded_path = decoded_path[1:-1]
                decoded_path = decoded_path.replace('\\"', '"')
                decoded_path = decoded_path.replace('\\\\', '\\')

            # Octal escapes are handled whether or not the path was quoted.
            if re.search(r'\\[0-7]{3}', decoded_path):
                decoded_path = DiffParser._decode_octal_sequences(decoded_path)

            return decoded_path

        except Exception as e:
            logger.warning(f"解码文件路径时发生错误: {e}, 原始路径: {file_path}")
            return file_path

    @staticmethod
    def _decode_octal_sequences(text: str) -> str:
        """
        Decode runs of ``\\ooo`` octal escapes as UTF-8 bytes.

        Each maximal run of consecutive escapes is decoded as a unit, since one
        character may span several bytes. A run that is not valid UTF-8, or
        that holds a value above 255, is left exactly as written.

        Args:
            text: text that may contain octal escapes

        Returns:
            The text with every decodable run replaced by its characters.
        """
        def decode_run(match: "re.Match") -> str:
            values = [int(octal, 8) for octal in re.findall(r'\\([0-7]{3})', match.group(0))]
            try:
                return bytes(values).decode('utf-8')
            except (ValueError, UnicodeDecodeError):
                # ValueError: a value above 255 is not a byte.
                return match.group(0)

        return re.sub(r'(?:\\[0-7]{3})+', decode_run, text)

    @staticmethod
    def _create_diff_file_info(file_path: str, diff_lines: List[str]) -> Optional["DiffFileInfo"]:
        """Build the ``DiffFileInfo`` for one file section from its lines."""
        diff_content = '\n'.join(diff_lines)
        lines_added, lines_deleted = DiffParser._count_lines_changed(diff_content)

        return DiffFileInfo(
            file_path=file_path,
            diff_content=diff_content,
            lines_added=lines_added,
            lines_deleted=lines_deleted
        )

    @staticmethod
    def _count_lines_changed(diff_content: str) -> Tuple[int, int]:
        """
        Count added and deleted lines in a diff.

        Args:
            diff_content: diff text for one or more files

        Returns:
            ``(lines_added, lines_deleted)``
        """
        lines_added, lines_deleted = 0, 0
        # Inside a hunk every "+"/"-" line is content, even when the content
        # itself begins with "++" or "--" (e.g. "+++i;" adds the line "++i;").
        # Outside a hunk "+++"/"---" are the file headers and are skipped.
        in_hunk = False

        for line in diff_content.strip().split('\n'):
            if line.startswith('diff --git'):
                in_hunk = False
            elif line.startswith('@@'):
                in_hunk = True
            elif line.startswith('+'):
                if in_hunk or not line.startswith('+++'):
                    lines_added += 1
            elif line.startswith('-'):
                if in_hunk or not line.startswith('---'):
                    lines_deleted += 1

        return lines_added, lines_deleted
temperature=temperature + ) + elif backend_type == BACKEND_TYPE_SILICONFLOW: + return ChatOpenAI( + model=model_name, + api_key=SecretStr(siliconflow_api_key), + base_url=siliconflow_api_base, + temperature=temperature + ) + else: + raise ValueError(f"不支持的后端类型: {backend_type}") + +class PromptTemplates: + """提示模板集合""" + + @staticmethod + def get_single_file_prompt() -> ChatPromptTemplate: + """获取单文件分析提示模板""" + return ChatPromptTemplate.from_messages([ + ("system", f""" +你是一个专业的Git维护专家,擅长总结社区文档的改动,请分析以下git diff中单个文件的改动,并生成结构化的摘要。 + +请仔细分析这个文件的改动,并按照以下要求生成摘要: + +**务必注意:当你对单个文件的所有变更内容从头到尾进行过完整的分析之后,再生成你最终的结论!不要仅根据其中几行的增删改就给出你的结论!** + +1. 改动类型判断(必须选择以下四种之一,请严格按照示例进行判断): + + - "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - "仅涉及标点符号的修改":仅修改了标点符号的增减、删除、变动,几乎不影响理解 + - "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + - "涉及到中英文文本内容的修改":修改了文档内容、命令或代码注释、字符串等文本,需要对内容进行翻译或调整以使得所有语种的人都可以理解 + +**其中,你需要重点对后三种类型的修改进行区分。越靠后,修改类型判定的优先级越高。** +如果修改的内容仅仅为新增了二进制文件、新增了依赖库等其他内容,绝大部分情况都可以归类为"涉及到其他内容的修改"。 +如果修改的内容不涉及中文或英文字符且不涉及代码改动,绝大部分情况都可以归类为"仅涉及标点符号的修改",但一旦存在除了标点符号或文档格式以外的改动,则优先归为其他类别。 +如果修改的内容涉及代码逻辑、函数定义、配置结构、脚本实现等可能产生现实影响的变更,或者对环境部署命令行、内容配置进行了更改或调整,但不需要对内容进行翻译或调整以使得所有语种的人都可以理解,则归类为"涉及到代码内容的修改"。 +如果修改的内容涉及中文或英文字符,且需要对内容进行翻译或调整以使得所有语种的人都可以理解,可以归类为"涉及到中英文文本内容的修改"。 +一个区分"涉及到代码内容的修改"和"涉及到中英文文本内容的修改"的标准是:如果当前的改动属于某一语言,如果使用者不理解该语言,则必须要对改动进行翻译才能理解,则归类为"涉及到中英文文本内容的修改",否则归类为"涉及到代码内容的修改"。 + +下面我将提供几个判断示例供你参考: + +示例1 - 仅涉及标点符号的修改: +```diff +- 这是一个测试文档,用于演示功能。 ++ 这是一个测试文档,用于演示功能! +``` +分析:只变更了逗号为中文逗号,句号为感叹号,属于"仅涉及标点符号的修改" +或者文件中: +```diff +- 这个文档的功能有进一步补充的空间。 ++ 这个文档的功能有进一步补充的空间! 
+``` +分析:只涉及中文句号和感叹号的增删改,不涉及中文字符和英文字符的改动,且不涉及代码改动,属于"仅涉及标点符号的修改" + +示例2 - 涉及到代码内容的修改: +```diff +- function getUserInfo() ++ function getUserProfile() +``` +或者在文档的代码块中: +```diff +- ```python +- def hello(): +- print("hello") +- ``` ++ ```python ++ def greeting(): ++ print("hello world") ++ ``` +``` +或者在文档的命令行代码块中 +```diff +- pwd +- cat /etc/profile ++ sudo apt update ++ whoami ++ echo "hello" +``` +分析:修改了函数名、逻辑或文档文本中的代码块等,但是不涉及需要翻译的内容,属于"涉及到代码内容的修改" + +示例3 - 涉及到中英文文本内容的修改: +```diff +- // 这是一个注释说明 ++ // 这是一个更详细的注释说明 +``` +或者JSON中: +```diff +- "description": "用户管理模块" ++ "description": "用户账户管理模块" +``` +分析:修改了注释或文档文本内容,影响用户的阅读理解,需要对内容进行翻译或调整以使得所有语种的人都可以理解,属于"涉及到中英文文本内容的修改" + +示例4 - 涉及到其他内容的修改: +```diff ++ Binary file image.png added +``` +或者: +```diff ++ "dependencies": ++ "new-package": "^1.0.0" ++ +``` +分析:新增了二进制文件或依赖包等,属于"涉及到其他内容的修改" + +2. 潜在影响分析: + - 分析这个文件的改动可能对其他文件或整体系统造成的影响 + - 考虑依赖关系、接口变化、数据流等 + - 如果是配置文件的修改,考虑对系统配置的影响 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 详细摘要: + - 提炼出摘要改动文件所属的板块,并解释板块作用 + - 结合文件名和改动细节,用详细的语言描述具体的改动内容,要求准确全面,且改动内容要做到具体 + - 突出重要的改动点和影响范围,包括修改内容主要针对的对象、文档的分类等 + - 结合文件名、改动类型、潜在影响分析,对摘要做进一步补充 + +4. 输出格式: + - 请用中文生成摘要 + - 要求改动类型、潜在影响、改动内容总结都包含在摘要中,不能存在空字段 + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + + """), + ("human", """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} + + """) + ]) + + @staticmethod + def get_total_summary_prompt() -> ChatPromptTemplate: + """获取总摘要生成提示模板""" + return ChatPromptTemplate.from_messages([ + ("system", """ +你是一个专业的Git维护专家,擅长总结社区文档的改动,请基于以下各个文件的改动摘要,生成整个git diff的总摘要。 + +请分析所有文件的改动,并生成一个总摘要,要求: + +1. 
整体改动类型统计: + - 统计所有文件涉及到的改动类型,取并集 + - 四种改动类型说明: + * "仅涉及标点符号的修改":只修改了标点符号的增减、删除、变动 + * "涉及到中英文文本内容的修改":修改了文档内容、注释等文本,但未涉及代码逻辑 + * "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + * "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - 将所有出现的改动类型都列出,不做优先级选择 + +统计示例: + +示例1 - 单一类型: +文件A:仅涉及标点符号的修改 +文件B:仅涉及标点符号的修改 +→ 整体改动类型:["仅涉及标点符号的修改"] + +示例2 - 多种类型: +文件A:仅涉及标点符号的修改 +文件B:涉及到中英文文本内容的修改 +→ 整体改动类型:["仅涉及标点符号的修改", "涉及到中英文文本内容的修改"] + +示例3 - 复杂混合: +文件A:涉及到中英文文本内容的修改 +文件B:涉及到代码内容的修改 +文件C:涉及到其他内容的修改 +→ 整体改动类型:["涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] + +2. 整体潜在影响分析: + - 逐个总结所有文件的改动内容,并进行详细的列举,尽量涵盖所有修改内容 + - 综合分析所有文件改动对系统的整体影响 + - 考虑文件间的依赖关系和系统架构影响 + - 评估改动的风险等级和影响范围 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 整体摘要详细列举: + - 提炼出所有摘要改动文件所属的板块,并解释板块作用 + - 用详细的语言分条概括每个摘要文件的核心内容,需要具体到文件,这一部分要占到最大的篇幅,不要遗漏任何摘要文件的内容 + - 突出重要的改动点,包括修改内容主要针对的对象、文档的分类等 + - 注意:整体摘要需要总结所有文件的内容;整体摘要需要尽可能详细 + +4. 输出格式: + - 请用中文生成摘要,整体摘要内容字段务必全面详细 + - 要求整体潜在影响、整体摘要都包含在摘要中,不能存在空字段 + - 整体摘要必须满足以下格式:"本次更改涉及到XXX等文件,这些文件分别属于社区中的XXX模块。涉及到XXX的修改,可能会对XXX造成影响。总的来说,这次更改主要是XXX。" + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + + + """), + ("human", """ +各个文件的改动摘要: +{file_changes} + +总文件数: {total_files} + """) + ]) + +class SingleFileAnalysisChain: + """单文件分析任务链""" + + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, + backend_type: str = DEFAULT_BACKEND_TYPE): + self.llm = llm + self.token_counter = token_counter + self.backend_type = backend_type + + # 创建输出解析器 + self.output_parser = JsonOutputParser(pydantic_object=SingleFileSummary) + + # 根据后端类型选择不同的链构建方式 + if backend_type == BACKEND_TYPE_OLLAMA: + self.prompt = PromptTemplates.get_single_file_prompt() + self.chain = self.prompt | self.llm.with_structured_output(SingleFileSummary) + else: + # 为硅基流动平台添加输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "change_type": "改动类型(必须是以下之一:仅涉及标点符号的修改、涉及到中英文文本内容的修改、涉及到代码内容的修改、涉及到其他内容的修改)", + "potential_impact": "改动对其他文件潜在的影响", + "summary": "改动的详细摘要" +}} +""" + # 
创建新的prompt模板 + system_template = """ +你是一个专业的Git维护专家,擅长总结社区文档的改动,请分析以下git diff中单个文件的改动,并生成结构化的摘要。 + +请仔细分析这个文件的改动,并按照以下要求生成摘要: + +**务必注意:当你对单个文件的所有变更内容从头到尾进行过完整的分析之后,再生成你最终的结论!不要仅根据其中几行的增删改就给出你的结论!** + +1. 改动类型判断(必须选择以下四种之一,请严格按照示例进行判断): + + - "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - "仅涉及标点符号的修改":仅修改了标点符号的增减、删除、变动,几乎不影响理解 + - "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + - "涉及到中英文文本内容的修改":修改了文档内容、命令或代码注释、字符串等文本,需要对内容进行翻译或调整以使得所有语种的人都可以理解 + +**其中,你需要重点对后三种类型的修改进行区分。越靠后,修改类型判定的优先级越高。** +如果修改的内容仅仅为新增了二进制文件、新增了依赖库等其他内容,绝大部分情况都可以归类为"涉及到其他内容的修改"。 +如果修改的内容不涉及中文或英文字符且不涉及代码改动,绝大部分情况都可以归类为"仅涉及标点符号的修改",但一旦存在除了标点符号或文档格式以外的改动,则优先归为其他类别。 +如果修改的内容涉及代码逻辑、函数定义、配置结构、脚本实现等可能产生现实影响的变更,或者对环境部署命令行、内容配置进行了更改或调整,但不需要对内容进行翻译或调整以使得所有语种的人都可以理解,则归类为"涉及到代码内容的修改"。 +如果修改的内容涉及中文或英文字符,且需要对内容进行翻译或调整以使得所有语种的人都可以理解,可以归类为"涉及到中英文文本内容的修改"。 +一个区分"涉及到代码内容的修改"和"涉及到中英文文本内容的修改"的标准是:如果当前的改动属于某一语言,如果使用者不理解该语言,则必须要对改动进行翻译才能理解,则归类为"涉及到中英文文本内容的修改",否则归类为"涉及到代码内容的修改"。 + +下面我将提供几个判断示例供你参考: + +示例1 - 仅涉及标点符号的修改: +```diff +- 这是一个测试文档,用于演示功能。 ++ 这是一个测试文档,用于演示功能! +``` +分析:只变更了逗号为中文逗号,句号为感叹号,属于"仅涉及标点符号的修改" +或者文件中: +```diff +- 这个文档的功能有进一步补充的空间。 ++ 这个文档的功能有进一步补充的空间! +``` +分析:只涉及中文句号和感叹号的增删改,不涉及中文字符和英文字符的改动,且不涉及代码改动,属于"仅涉及标点符号的修改" + +示例2 - 涉及到代码内容的修改: +```diff +- function getUserInfo() ++ function getUserProfile() +``` +或者在文档的代码块中: +```diff +- ```python +- def hello(): +- print("hello") +- ``` ++ ```python ++ def greeting(): ++ print("hello world") ++ ``` +``` +或者在文档的命令行代码块中 +```diff +- pwd +- cat /etc/profile ++ sudo apt update ++ whoami ++ echo "hello" +``` +分析:修改了函数名、逻辑或文档文本中的代码块等,但是不涉及需要翻译的内容,属于"涉及到代码内容的修改" + +示例3 - 涉及到中英文文本内容的修改: +```diff +- // 这是一个注释说明 ++ // 这是一个更详细的注释说明 +``` +或者JSON中: +```diff +- "description": "用户管理模块" ++ "description": "用户账户管理模块" +``` +分析:修改了注释或文档文本内容,影响用户的阅读理解,需要对内容进行翻译或调整以使得所有语种的人都可以理解,属于"涉及到中英文文本内容的修改" + +示例4 - 涉及到其他内容的修改: +```diff ++ Binary file image.png added +``` +或者: +```diff ++ "dependencies": ++ "new-package": "^1.0.0" ++ +``` +分析:新增了二进制文件或依赖包等,属于"涉及到其他内容的修改" + +2. 
潜在影响分析: + - 分析这个文件的改动可能对其他文件或整体系统造成的影响 + - 考虑依赖关系、接口变化、数据流等 + - 如果是配置文件的修改,考虑对系统配置的影响 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 详细摘要: + - 提炼出摘要改动文件所属的板块,并解释板块作用 + - 结合文件名和改动细节,用详细的语言描述具体的改动内容,要求准确全面,且改动内容要做到具体 + - 突出重要的改动点和影响范围,包括修改内容主要针对的对象、文档的分类等 + - 结合文件名、改动类型、潜在影响分析,对摘要做进一步补充 + +4. 输出格式: + - 请用中文生成摘要 + - 要求改动类型、潜在影响、改动内容总结都包含在摘要中,不能存在空字段 + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + +{format_instructions} +""" + human_template = """ +文件路径: {file_path} + +Git Diff 内容: +{diff_content} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def analyze(self, diff_file_info: DiffFileInfo, + max_retry_ollama: int = DEFAULT_MODEL_MAX_RETRY_OLLAMA, + max_retry: int = DEFAULT_MODEL_MAX_RETRY) -> Optional[SingleFileSummary]: + """分析单个文件的改动""" + max_retry_count = max_retry_ollama if self.backend_type == BACKEND_TYPE_OLLAMA else max_retry + for attempt in range(1, max_retry_count + 1): + # 如果不是第一次尝试,等待一段时间再重试,避免连续失败 + if attempt > 1: + delay = min(attempt * 2, 10) # 递增延迟,最多10秒 + logger.info(f"第{attempt}次尝试分析文件 {diff_file_info.file_path},等待{delay}秒...") + time.sleep(delay) + + try: + # 构造prompt字符串 + prompt_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content + } + try: + messages = self.prompt.format_messages(**prompt_args) + if messages and len(messages) > 0: + message = messages[0] + if hasattr(message, 'content') and message.content: + prompt_str = str(message.content) + if prompt_str: + self.token_counter.count_prompt(prompt_str) + except Exception as e: + logger.warning(f"格式化prompt时发生错误: {e}") + + # 直接调用,简化超时控制 + invoke_args = { + "file_path": diff_file_info.file_path, + "diff_content": diff_file_info.diff_content, + "lines_added": diff_file_info.lines_added, + "lines_deleted": diff_file_info.lines_deleted + 
} + if self.backend_type != BACKEND_TYPE_OLLAMA: + invoke_args["response_format"] = {"type": "json_object"} + + result = self.chain.invoke(invoke_args) + # 验证结果有效性 + if isinstance(result, (dict, SingleFileSummary)): + if isinstance(result, dict): + result = SingleFileSummary(**result) + + # 检查结果完整性 + if result and hasattr(result, 'summary') and result.summary and result.change_type: + # 统计completion token + try: + completion_str = str(result.summary) + if completion_str: + self.token_counter.count_completion(completion_str) + except Exception as e: + logger.warning(f"计算completion tokens时发生错误: {e}") + + # 设置准确值 + result.file_path = diff_file_info.file_path + result.lines_added = diff_file_info.lines_added + result.lines_deleted = diff_file_info.lines_deleted + return result + + # 结果无效,记录并重试 + logger.warning(f"分析文件 {diff_file_info.file_path} 返回无效结果,第{attempt}次尝试") + if attempt < max_retry_count: + continue + except Exception as e: + err_str = str(e) + # 检查是否为HTTP错误(如404、5xx),常见关键字有status code、HTTP、response等 + is_http_error = False + for code in ["404", "500", "502", "503", "504"]: + if code in err_str: + is_http_error = True + break + if ("status code" in err_str or "HTTP" in err_str or "response" in err_str) and \ + any(code in err_str for code in ["404", "500", "502", "503", "504"]): + is_http_error = True + if is_http_error: + logger.error(f"分析文件 {diff_file_info.file_path} 时发生HTTP错误: {e}," + f"第{attempt}次尝试,10秒后重试...") + if attempt < max_retry_count: + time.sleep(10) + continue + else: + logger.error(f"分析文件 {diff_file_info.file_path} 时发生错误: {e},第{attempt}次尝试") + # 其它异常直接进入下一次重试 + if attempt < max_retry_count: + logger.info(f"第{attempt}次尝试失败,准备重试...") + logger.error(f"分析文件 {diff_file_info.file_path} 连续{max_retry_count}次均未获得结构化输出," + f"放弃。") + return None + +class TotalSummaryChain: + """总摘要生成任务链""" + + def __init__(self, llm: ChatOllama | ChatOpenAI, token_counter: TokenCounter, + backend_type: str = DEFAULT_BACKEND_TYPE): + self.llm = llm + self.token_counter = 
token_counter + self.backend_type = backend_type + + # 创建输出解析器 + self.output_parser = JsonOutputParser(pydantic_object=TotalSummary) + + # 根据后端类型选择不同的链构建方式 + if backend_type == BACKEND_TYPE_OLLAMA: + self.prompt = PromptTemplates.get_total_summary_prompt() + self.chain = self.prompt | self.llm.with_structured_output(TotalSummary) + else: + # 为硅基流动平台添加输出格式说明 + format_instructions = """ +请以JSON格式输出,包含以下字段: +{{ + "overall_potential_impact": "整体改动对其他文件潜在的影响", + "overall_summary": "整体改动的详细摘要" +}} +""" + # 创建新的prompt模板 + system_template = """ +你是一个专业的Git维护专家,擅长总结社区文档的改动,请基于以下各个文件的改动摘要,生成整个git diff的总摘要。 + +请分析所有文件的改动,并生成一个总摘要,要求: + +1. 整体改动类型统计: + - 统计所有文件涉及到的改动类型,取并集 + - 四种改动类型说明: + * "仅涉及标点符号的修改":只修改了标点符号的增减、删除、变动 + * "涉及到中英文文本内容的修改":修改了文档内容、注释等文本,但未涉及代码逻辑 + * "涉及到代码内容的修改":修改了代码逻辑、函数定义、配置结构、命令行内容、脚本实现等 + * "涉及到其他内容的修改":新增二进制文件、新增依赖库等其他内容 + - 将所有出现的改动类型都列出,不做优先级选择 + +统计示例: + +示例1 - 单一类型: +文件A:仅涉及标点符号的修改 +文件B:仅涉及标点符号的修改 +→ 整体改动类型:["仅涉及标点符号的修改"] + +示例2 - 多种类型: +文件A:仅涉及标点符号的修改 +文件B:涉及到中英文文本内容的修改 +→ 整体改动类型:["仅涉及标点符号的修改", "涉及到中英文文本内容的修改"] + +示例3 - 复杂混合: +文件A:涉及到中英文文本内容的修改 +文件B:涉及到代码内容的修改 +文件C:涉及到其他内容的修改 +→ 整体改动类型:["涉及到中英文文本内容的修改", "涉及到代码内容的修改", "涉及到其他内容的修改"] + +2. 整体潜在影响分析: + - 逐个总结所有文件的改动内容,并进行详细的列举,尽量涵盖所有修改内容 + - 综合分析所有文件改动对系统的整体影响 + - 考虑文件间的依赖关系和系统架构影响 + - 评估改动的风险等级和影响范围 + - 如果对其他文件无潜在影响,请说明无潜在影响及原因 + +3. 整体摘要详细列举: + - 提炼出所有摘要改动文件所属的板块,并解释板块作用 + - 用详细的语言分条概括每个摘要文件的核心内容,需要具体到文件,这一部分要占到最大的篇幅,不要遗漏任何摘要文件的内容 + - 突出重要的改动点,包括修改内容主要针对的对象、文档的分类等 + - 注意:整体摘要需要总结所有文件的内容;整体摘要需要尽可能详细 + +4. 
输出格式: + - 请用中文生成摘要,整体摘要内容字段务必全面详细 + - 要求整体潜在影响、整体摘要都包含在摘要中,不能存在空字段 + - 整体摘要必须满足以下格式:"本次更改涉及到XXX等文件,这些文件分别属于社区中的XXX模块。涉及到XXX的修改,可能会对XXX造成影响。总的来说,这次更改主要是XXX。" + - 严格检查你的输出,对"新增"、"删除"、"修改"等字眼要严格检查,确保没有出现语义错误 + - 严格检查你的输出,确保没有出现语义错误,对于出现的数字、改动的具体内容务必保证描述完全吻合 + +{format_instructions} +""" + human_template = """ +各个文件的改动摘要: +{file_changes} + +总文件数: {total_files} +""" + self.prompt = ChatPromptTemplate.from_messages([ + ("system", system_template.format(format_instructions=format_instructions)), + ("human", human_template) + ]) + self.chain = self.prompt | self.llm | self.output_parser + + def generate(self, file_summaries: List[SingleFileSummary], + total_summary_timeout: int = DEFAULT_TOTAL_SUMMARY_TIMEOUT) -> Optional[TotalSummary]: + """生成总摘要""" + try: + total_files = len(file_summaries) + total_lines = sum(s.lines_added + s.lines_deleted for s in file_summaries) + file_changes_info = [] + # 收集所有改动类型 + all_change_types = list(set(s.change_type for s in file_summaries)) + + for summary in file_summaries: + file_changes_info.append({ + 'file_path': summary.file_path, + 'change_type': summary.change_type, + 'potential_impact': summary.potential_impact, + 'summary': summary.summary + }) + + # 构造prompt字符串 + prompt_args = { + "file_changes": json.dumps(file_changes_info, ensure_ascii=False, indent=2), + "total_files": total_files + } + try: + messages = self.prompt.format_messages(**prompt_args) + if messages and len(messages) > 0: + message = messages[0] + if hasattr(message, 'content') and message.content: + prompt_str = str(message.content) + if prompt_str: + self.token_counter.count_prompt(prompt_str) + except Exception as e: + logger.warning(f"格式化prompt时发生错误: {e}") + + # 使用线程池执行器为总摘要生成添加超时控制 + timeout_executor = None + try: + timeout_executor = ThreadPoolExecutor(max_workers=1) + invoke_args = { + "file_changes": json.dumps(file_changes_info, ensure_ascii=False, indent=2), + "total_files": total_files, + "total_lines": total_lines + } + if self.backend_type != 
BACKEND_TYPE_OLLAMA: + # 为 SiliconFlow 添加 response_format 参数 + invoke_args["response_format"] = {"type": "json_object"} + + # 提交任务并设置超时 + future = timeout_executor.submit(self.chain.invoke, invoke_args) + try: + result = future.result(timeout=total_summary_timeout) + except (FutureTimeoutError, TimeoutError) as e: + logger.error(f"生成总摘要超时({total_summary_timeout}秒),放弃生成总摘要: " + f"{type(e).__name__}") + try: + future.cancel() # 尝试取消超时的任务 + except Exception as cancel_e: + logger.warning(f"取消任务时发生错误: {cancel_e}") + return None + finally: + # 确保线程池被正确关闭 + if timeout_executor: + try: + timeout_executor.shutdown(wait=False) + except Exception as shutdown_e: + logger.warning(f"关闭总摘要线程池时发生错误: {shutdown_e}") + + # 处理结果 + if isinstance(result, (dict, TotalSummary)): + # 如果是dict(来自JsonOutputParser),转换为TotalSummary + if isinstance(result, dict): + result = TotalSummary(**result) + try: + if result and hasattr(result, 'overall_summary'): + summary = result.overall_summary + if summary: + completion_str = str(summary) + if completion_str: + self.token_counter.count_completion(completion_str) + except Exception as e: + logger.warning(f"计算completion tokens时发生错误: {e}") + return TotalSummary( + total_files_changed=total_files, + total_lines_changed=total_lines, + overall_potential_impact=result.overall_potential_impact, + overall_summary=result.overall_summary, + change_type_list=all_change_types, + file_changes=[ + FileChangeInfo( + file_path=summary.file_path, + change_type=summary.change_type, + lines_changed=summary.lines_added + summary.lines_deleted + ) + for summary in file_summaries + ] + ) + else: + logger.error(f"生成总摘要时返回类型错误: {type(result)}") + return None + except Exception as e: + logger.error(f"生成总摘要时发生错误: {e}") + return None + +# ==================== 主处理类 ==================== + +class GitDiffSummarizer: + """Git Diff 摘要生成器""" + + def __init__(self, siliconflow_api_key: str = "", + siliconflow_api_base: str = "https://api.siliconflow.cn/v1", + model_name: str = None, 
base_url: str = None, backend_type: str = None, + temperature: float = None, max_workers: int = None, + single_file_timeout: int = None, total_summary_timeout: int = None, + max_retry: int = None, max_retry_ollama: int = None): + if model_name is None: + model_name = DEFAULT_MODEL_NAME + if base_url is None: + base_url = DEFAULT_OLLAMA_BASE_URL + if backend_type is None: + backend_type = DEFAULT_BACKEND_TYPE + if temperature is None: + temperature = DEFAULT_MODEL_TEMPERATURE + if max_workers is None: + max_workers = DEFAULT_PROCESSING_MAX_WORKERS + if single_file_timeout is None: + single_file_timeout = DEFAULT_SINGLE_FILE_TIMEOUT + if total_summary_timeout is None: + total_summary_timeout = DEFAULT_TOTAL_SUMMARY_TIMEOUT + if max_retry is None: + max_retry = DEFAULT_MODEL_MAX_RETRY + if max_retry_ollama is None: + max_retry_ollama = DEFAULT_MODEL_MAX_RETRY_OLLAMA + + self.backend_type = backend_type + self.max_workers = max_workers + self.single_file_timeout = single_file_timeout + self.total_summary_timeout = total_summary_timeout + self.max_retry = max_retry + self.max_retry_ollama = max_retry_ollama + + self.token_counter = TokenCounter(model_name) + self.llm = LLMFactory.create_chat_llm(model_name, base_url, backend_type, temperature, + siliconflow_api_key, siliconflow_api_base) + self.single_file_chain = SingleFileAnalysisChain(self.llm, self.token_counter, backend_type) + self.total_summary_chain = TotalSummaryChain(self.llm, self.token_counter, backend_type) + + def cleanup(self): + """清理资源,确保程序能正确退出""" + try: + # 清理 LLM 连接 + if hasattr(self.llm, 'client') and hasattr(self.llm.client, 'close'): + self.llm.client.close() + elif hasattr(self.llm, '_client') and hasattr(self.llm._client, 'close'): + self.llm._client.close() + + # 如果是 ChatOpenAI,尝试关闭底层的 HTTP 客户端 + if self.backend_type == BACKEND_TYPE_SILICONFLOW and hasattr(self.llm, 'client'): + try: + # 强制关闭 httpx 客户端 + if hasattr(self.llm.client, '_client'): + self.llm.client._client.close() + except 
Exception as e: + logger.debug(f"关闭 HTTP 客户端时发生错误: {e}") + + logger.info("资源清理完成") + except Exception as e: + logger.warning(f"清理资源时发生错误: {e}") + + def process_git_diff(self, diff_content: str, max_workers: int = None) -> ProcessingResult: + if max_workers is None: + max_workers = self.max_workers + + logger.info("开始解析git diff...") + files = DiffParser.parse_git_diff(diff_content) + logger.info(f"解析到 {len(files)} 个文件的改动") + if not files: + logger.warning("未找到任何文件改动") + return ProcessingResult( + file_summaries=[], + total_summary=None, + processed_files=0, + total_files=0, + error='未找到任何文件改动' + ) + logger.info("开始并行处理各个文件的改动...") + file_summaries = [] + # 使用更健壮的并发处理机制 + executor = None + try: + executor = ThreadPoolExecutor(max_workers=max_workers) + future_to_file = { + executor.submit(self.single_file_chain.analyze, file_info, self.max_retry_ollama, self.max_retry): file_info.file_path + for file_info in files + } + + # 设置更长的整体超时时间,避免与单个文件超时冲突 + overall_timeout = self.single_file_timeout * len(files) + 600 # 给每个文件的时间 + 额外缓冲 + + completed_count = 0 + total_count = len(future_to_file) + + try: + for future in as_completed(future_to_file, timeout=overall_timeout): + file_path = future_to_file[future] + completed_count += 1 + try: + summary = future.result(timeout=5) # 短暂缓冲时间,因为任务已经完成 + if summary: + file_summaries.append(summary) + logger.info(f"完成文件 {file_path} 的摘要生成 " + f"({completed_count}/{total_count})") + else: + logger.warning(f"文件 {file_path} 的摘要生成失败 " + f"({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError) as e: + logger.error(f"文件 {file_path} 的摘要获取超时,跳过该文件: " + f"{type(e).__name__} ({completed_count}/{total_count})") + try: + future.cancel() + except Exception as cancel_e: + logger.warning(f"取消任务时发生错误: {cancel_e}") + except Exception as e: + logger.error(f"处理文件 {file_path} 时发生异常: {e} ({completed_count}/{total_count})") + except (FutureTimeoutError, TimeoutError) as overall_e: + logger.error(f"整体处理超时({overall_timeout}秒)," + 
f"已完成{completed_count}/{total_count}个文件") + # 取消所有未完成的任务 + for future in future_to_file: + if not future.done(): + try: + future.cancel() + except Exception as cancel_e: + logger.warning(f"取消未完成任务时发生错误: {cancel_e}") + finally: + # 确保线程池被正确关闭 + if executor: + try: + executor.shutdown(wait=True) + except Exception as shutdown_e: + logger.warning(f"关闭主线程池时发生错误: {shutdown_e}") + logger.info(f"成功生成 {len(file_summaries)} 个文件的摘要") + logger.info("开始生成总摘要...") + total_summary = None + if file_summaries: + logger.info(f"基于 {len(file_summaries)} 个成功处理的文件生成总摘要...") + try: + total_summary = self.total_summary_chain.generate(file_summaries, self.total_summary_timeout) + if total_summary: + logger.info("总摘要生成成功") + else: + logger.warning("总摘要生成失败") + except Exception as e: + logger.error(f"生成总摘要时发生未预期的错误: {e}") + else: + logger.warning("没有成功处理的文件,跳过总摘要生成") + return ProcessingResult( + file_summaries=file_summaries, + total_summary=total_summary, + processed_files=len(file_summaries), + total_files=len(files) + ) + +# ==================== 主函数 ==================== + +def get_agent_summary(sample_diff, siliconflow_api_key="", + siliconflow_api_base="https://api.siliconflow.cn/v1", + model_name=None, base_url=None, backend_type=None, temperature=None, + max_workers=None, single_file_timeout=None, total_summary_timeout=None, + max_retry=None, max_retry_ollama=None): + + summarizer = GitDiffSummarizer(siliconflow_api_key, siliconflow_api_base, model_name, base_url, + backend_type, temperature, max_workers, single_file_timeout, + total_summary_timeout, max_retry, max_retry_ollama) + result = None + try: + result = summarizer.process_git_diff(sample_diff) + finally: + # 确保在函数退出前清理资源 + summarizer.cleanup() + + if not result: + print("处理失败,无法获取结果") + return None + + if result.error: + print(f"错误: {result.error}") + print("\n=== 单文件摘要 ===") + for summary in result.file_summaries: + print(f"文件: {summary.file_path}") + print(f"改动类型: {summary.change_type}") + print(f"新增行数: 
{summary.lines_added}") + print(f"删除行数: {summary.lines_deleted}") + print(f"潜在影响: {summary.potential_impact}") + print(f"摘要: {summary.summary}") + print("-" * 50) + print("=== 处理结果 ===") + print(f"总文件数: {result.total_files}") + print(f"成功处理文件数: {result.processed_files}") + if result.total_summary: + print("\n=== 总摘要 ===") + total = result.total_summary + print(f"总文件数: {total.total_files_changed}") + print(f"总改动行数: {total.total_lines_changed}") + print(f"改动类型列表: {total.change_type_list}") + print(f"整体潜在影响: {total.overall_potential_impact}") + print(f"整体摘要: {total.overall_summary}") + print("\n=== 文件改动列表 ===") + for file_change in total.file_changes: + print(f"- {file_change.file_path}: {file_change.change_type} ({file_change.lines_changed} 行)") + + # 输出token统计 + stats = summarizer.token_counter.get_stats() + print("\n=== Token消耗统计 ===") + print(f"Prompt tokens: {stats['prompt_tokens']}") + print(f"Completion tokens: {stats['completion_tokens']}") + print(f"Total tokens: {stats['total_tokens']}") + return result + +if __name__ == "__main__": + # 微服务接口逻辑模拟: 传递进来的就是 sample_diff 的内容 + sample_diff = sys.argv[1] + result = get_agent_summary(sample_diff) + print(result) \ No newline at end of file