From 5a3fc5467d0adbf3182c7557cf9958a8c397d558 Mon Sep 17 00:00:00 2001 From: TomNewChao <353712216@qq.com> Date: Thu, 4 May 2023 15:40:53 +0800 Subject: [PATCH] doc-ci add url check --- src/docs-ci/README.md | 14 +++ src/docs-ci/__init__.py | 5 + src/docs-ci/doc-ci.py | 141 ++++++++++++++++++++++ src/docs-ci/doc-daily-build.py | 213 +++++++++++++++++++++++++++++++++ src/docs-ci/ignore_url.yaml | 128 ++++++++++++++++++++ 5 files changed, 501 insertions(+) create mode 100644 src/docs-ci/README.md create mode 100644 src/docs-ci/__init__.py create mode 100644 src/docs-ci/doc-ci.py create mode 100644 src/docs-ci/doc-daily-build.py create mode 100644 src/docs-ci/ignore_url.yaml diff --git a/src/docs-ci/README.md b/src/docs-ci/README.md new file mode 100644 index 00000000..2af1afd3 --- /dev/null +++ b/src/docs-ci/README.md @@ -0,0 +1,14 @@ +doc-daily-build.py +安装 +pip3 install -y gevent click pyyaml +执行 +python3 doc-ci.py --repo=https://gitee.com/opengauss/docs.git + + + +doc-ci.py +安装 +pip3 install -y click +执行 +python3 doc-ci.py --pr_id=1 + diff --git a/src/docs-ci/__init__.py b/src/docs-ci/__init__.py new file mode 100644 index 00000000..d49ec230 --- /dev/null +++ b/src/docs-ci/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/4/21 15:16 +# @Author : Tom_zc +# @FileName: __init__.py.py +# @Software: PyCharm diff --git a/src/docs-ci/doc-ci.py b/src/docs-ci/doc-ci.py new file mode 100644 index 00000000..b6f1c801 --- /dev/null +++ b/src/docs-ci/doc-ci.py @@ -0,0 +1,141 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/4/21 15:16 +# @Author : Tom_zc +# @FileName: doc-ci.py +# @Software: PyCharm +import click +import requests +import time +import re +import sys +import traceback +from functools import wraps + + +class Config: + reg = r"(https:\/\/|http:\/\/|ftp:\/\/)([\w\-\.@?^=%&:\!/~\+#]*[\w\-\@?^=%&/~\+#])?" + ignore_file_suffix = [ + ".github", ".gitignore", ".git", ".png", ".jpg", ".woff2", ".woff", ".svg", ".gif", ".JPG" + ] + pr_info_url = "https://gitee.com/opengauss/docs/pulls/{}.diff" + header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"} + + +def func_retry(tries=3, delay=1): + + def deco_retry(fn): + @wraps(fn) + def inner(*args, **kwargs): + for i in range(tries): + try: + return fn(*args, **kwargs) + except Exception as e: + print("func_retry:{} e:{} traceback: {}".format(fn.__name__, e, traceback.format_exc())) + time.sleep(delay) + else: + raise RuntimeError("func_retry:{} over tries, failed".format(fn.__name__)) + + return inner + + return deco_retry + + +def extract_url(content): + url_list = re.findall(Config.reg, content) + new_url_list = list() + for url in url_list: + url = "".join(url) + print("check:{}".format(url)) + if url in [r"http://", r"https://"]: + continue + new_url_list.append(url) + return new_url_list + + +@func_retry(tries=3, delay=2) +def request_pr_info(url): + ret = requests.get(url, headers=Config.header, timeout=(30, 30)) + if not str(ret.status_code).startswith("2") and not str(ret.status_code).startswith("3"): + raise Exception("request pr info failed: {}--->{}".format(url, ret.status_code)) + else: + return ret.content + + +def parse_pr_info(content): + url_list = list() + content = content.decode("utf-8") + content_list = content.split("diff --git") + for content_temp in content_list: + path = content_temp.split("\n", maxsplit=1)[0].split(r" ")[-1][1:] + if path.startswith(r"b/content"): + for ignore_file in Config.ignore_file_suffix: + if ignore_file in path: + break + else: + url_list.extend(extract_url(content_temp)) + return list(set(url_list)) + + +def check_url(url_list): + failure_list = list() + for url in url_list: + ret = None + try: + ret = requests.head(url, headers=Config.header, timeout=(30, 30)) + if not str(ret.status_code).startswith("2") and not str(ret.status_code).startswith("3"): + ret.close() + ret = requests.get(url, headers=Config.header, timeout=(30, 30)) + if not str(ret.status_code).startswith("2") and not str(ret.status_code).startswith("3"): + raise Exception("find the dead link:{}--->{}".format(url, ret.status_code)) + else: + print("check valid url:{}".format(url)) + else: + print("check valid url:{}".format(url)) + except Exception as e: + print("check_url--->url:{}, e:{}".format(url, e)) + failure_list.append(url) + finally: + try: + if ret: + ret.close() + except Exception as e: + print(e) + return failure_list + + +def worker(pr_id): + print("----------------1.start to check pr-----------------------------") + try: + url = Config.pr_info_url.format(pr_id) + pr_content = request_pr_info(url) + url_list = parse_pr_info(pr_content) + print("check url:{}".format(url_list)) + failed_list = check_url(url_list) + return failed_list + except Exception as e: + raise RuntimeError("doc-ci--->pr_id: {}, e:{}, detail:{}".format(pr_id, str(e), traceback.format_exc())) + + +def print_content(failed_list): + print("----------------2.start to output lose efficacy-----------------") + for url in failed_list: + content = "check dead_link, the url is:{}\n".format(url) + print(content) + + +@click.command() +@click.option("--pr_id", help="the pr_id of git") +def main(pr_id): + if not pr_id: + raise RuntimeError("invalid pr_id") + start_time = time.time() + failed_list = worker(pr_id) + print_content(failed_list) + end_time = time.time() + print("----------------spend time:{}-----".format(end_time - start_time)) + if failed_list: + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/src/docs-ci/doc-daily-build.py b/src/docs-ci/doc-daily-build.py new file mode 100644 index 00000000..03c203cb --- /dev/null +++ b/src/docs-ci/doc-daily-build.py @@ -0,0 +1,213 @@ +# -*- coding: utf-8 -*- +# @Time : 2023/4/24 18:56 +# @Author : Tom_zc +# @FileName: doc-daily-build.py +# @Software: PyCharm +import sys +from functools import wraps + +import gevent +from gevent.pool import Pool +from gevent import monkey + +monkey.patch_all(thread=False) + +import os +import re +import click +import subprocess +import requests +import shutil +import time +import traceback +import yaml + +ALL_FAILURE = dict() + + +class Config: + cmd = "cd /root && git clone {} > /dev/null" + path = "/root/{}" + relatively_path = "/root/" + reg = r"(https:\/\/|http:\/\/|ftp:\/\/)([\w\-\.@?^=%&:\!/~\+#]*[\w\-\@?^=%&/~\+#])?" + ignore_file_suffix = [ + ".github", ".gitignore", ".git", ".png", ".jpg", ".woff2", ".woff", ".svg", ".gif", ".JPG" + ] + ignore_file_dict = None + + +def func_retry(tries=3, delay=1): + def deco_retry(fn): + @wraps(fn) + def inner(*args, **kwargs): + for i in range(tries): + try: + return fn(*args, **kwargs) + except Exception as e: + print("func_retry:{} e:{} traceback: {}".format(fn.__name__, e, traceback.format_exc())) + time.sleep(delay) + else: + raise RuntimeError("func_retry:{} over tries, failed".format(fn.__name__)) + + return inner + + return deco_retry + + +def load_ignore_url(file_path="ignore_url.yaml", method="load"): + if Config.ignore_file_dict is None: + yaml_load_method = getattr(yaml, method) + with open(file_path, "r", encoding="utf-8") as file: + doc_list = yaml_load_method(file, Loader=yaml.FullLoader) + ignore_url_dict = dict() + for doc_temp in doc_list: + ignore_url_dict[doc_temp["path"]] = doc_temp["url"] + Config.ignore_file_dict = ignore_url_dict + return Config.ignore_file_dict + + +def prepare_env(filename): + path = Config.path.format(filename) + if os.path.exists(path): + shutil.rmtree(path) + + +def execute_cmd(cmd, timeout=600): + try: + p = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, close_fds=True) + t_wait_seconds = 0 + while True: + if p.poll() is not None: + break + if timeout >= 0 and t_wait_seconds >= (timeout * 100): + p.terminate() + return -1, "", "execute_cmd exceeded time {0} seconds in executing: {1}".format(timeout, cmd) + time.sleep(0.01) + t_wait_seconds += 1 + out, err = p.communicate() + ret = p.returncode + return ret, out, err + except Exception as e: + return -1, "", "execute_cmd exceeded raise, e={0}, trace={1}".format(e.args[0], traceback.format_exc()) + + +def parse_file(path): + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +def extract_url(content): + url_list = re.findall(Config.reg, content) + new_url_list = list() + for url in url_list: + url = "".join(url) + if url in [r"http://", r"https://"]: + continue + new_url_list.append(url) + return list(set(new_url_list)) + + +def get_url(abs_path, url_list): + failure_list = list() + for url in url_list: + ret = None + try: + headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:83.0) Gecko/20100101 Firefox/83.0"} + ret = requests.head(url, headers=headers, timeout=(30, 30)) + if not str(ret.status_code).startswith("2") and not str(ret.status_code).startswith("3"): + ret.close() + ret = requests.get(url, headers=headers, timeout=(30, 30)) + if not str(ret.status_code).startswith("2") and not str(ret.status_code).startswith("3"): + raise Exception("find the dead link:{}--->{}".format(url, ret.status_code)) + except Exception as e: + print("Url--->path:{}, url:{}, e:{}".format(abs_path, url, e)) + failure_list.append(url) + finally: + try: + if ret: + ret.close() + except Exception as e: + print(e) + + return failure_list + + +def worker(abs_path): + relatively_path = abs_path.split(Config.relatively_path)[-1] + try: + content = parse_file(abs_path) + url_list = extract_url(content) + failure_list = get_url(abs_path, url_list) + if failure_list: + ignore_doc_dict = load_ignore_url() + if relatively_path in ignore_doc_dict.keys(): + failure_list = list(set(failure_list) - set(ignore_doc_dict[relatively_path])) + if failure_list: + ALL_FAILURE.update({relatively_path: failure_list}) + except Exception as e: + print("Task--->path: {}, err:{}".format(relatively_path, e)) + + +@func_retry() +def download_rep(repo_name, repo): + print("----------------1.start to git clone:{}-----------------".format(repo)) + prepare_env(repo_name) + cmd = Config.cmd.format(repo) + code, std_out, std_err = execute_cmd(cmd) + if code != 0: + raise RuntimeError(std_err) + return std_out + + +def list_file(repo_name, lookup_dir): + print("----------------2.start to list repo:{}-----------------".format(repo_name)) + path = Config.path.format(repo_name) + if lookup_dir: + path = os.path.join(path, lookup_dir) + file_list = list() + for dir_path, _, filenames in os.walk(path): + for filename in filenames: + abs_path = os.path.join(dir_path, filename) + is_ignore_file = False + for file_suffix in Config.ignore_file_suffix: + if file_suffix in abs_path: + is_ignore_file = True + break + if is_ignore_file: + continue + file_list.append(abs_path) + coroutine_pool = Pool(200) + coroutine_task = [coroutine_pool.spawn(worker, i) for i in file_list] + gevent.joinall(coroutine_task) + + +def print_content(): + print("----------------3.start to output invalid url-----------------") + for path, url_list in ALL_FAILURE.items(): + for url in url_list: + content = "check dead_link, the path is :{}, the url is:{}\n".format(path, url) + print(content) + + +@click.command() +@click.option("--repo", help="the repository of git", default="https://gitee.com/opengauss/docs.git") +def main(repo): + start_time = time.time() + repo_name = repo.split(r"/")[-1].split(".")[0] + if not len(repo_name) or r".." in repo_name: + raise RuntimeError("The params of repository is invalid") + if repo_name == "docs": + lookup_dir = "content" + else: + lookup_dir = "" + download_rep(repo_name, repo) + list_file(repo_name, lookup_dir) + print_content() + end_time = time.time() + print("-----spend time:{}-----".format(end_time - start_time)) + if ALL_FAILURE: + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/src/docs-ci/ignore_url.yaml b/src/docs-ci/ignore_url.yaml new file mode 100644 index 00000000..c69c66ea --- /dev/null +++ b/src/docs-ci/ignore_url.yaml @@ -0,0 +1,128 @@ +- path: docs/content/zh/docs/AIFeatureGuide/service子命令.md + url: + - https://127.0.0.1:9187 +- path: docs/content/zh/docs/DataMigrationGuide/数据校验.md + url: + - http://localhost:8081 + - http://127.0.0.1:9000 + - http://127.0.0.1:9001 + - http://127.0.0.1:9002 +- path: docs/content/docs-lite/zh/docs/DatabaseReference/安全配置.md + url: + - https://127.0.0.1 +- path: docs/content/zh/docs/SQLReference/解析器.md + url: + - http://example.com/stuff/index.html +- path: docs/content/docs-lite/en/docs/DataBaseReference/security-configuration.md + url: + - https://127.0.0.1 +- path: docs/content/docs-lite/zh/docs/SQLReference/解析器.md + url: + - http://example.com/stuff/index.html +- path: docs/content/zh/docs/ToolandCommandReference/kdestroy.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/zh/docs/ToolandCommandReference/kadmin-local.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/zh/docs/ToolandCommandReference/klist.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/zh/docs/ToolandCommandReference/kdb5_util.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/zh/docs/ToolandCommandReference/krb5kdc.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/zh/docs/ToolandCommandReference/kinit.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/docs-lite/en/docs/SQLReference/parser.md + url: + - http://example.com/stuff/index.html +- path: docs/content/zh/docs/ToolandCommandReference/特性介绍.md + url: + - http://ip:port/CMRestAPI/keyword + - http://ip:port/CMRestAPI/ClusterStatus + - http://ip:port/CMRestAPI/MasterInfo + - http://ip:port/CMRestAPI + - http://ip:port/CMRestAPI/RecvAddr + - http://ip:port/CMRestAPI/NodeStatus +- path: docs/content/docs-lite/en/docs/AboutopenGauss/preparations.md + url: + - https://opengauss.obs.cn-south-1.myhuaweicloud.com/3.0.0/openGauss-third +- path: docs/content/zh/docs/AIFeatureGuide/DBMind部署.md + url: + - http://localhost:9187/metrics +- path: docs/content/en/docs/AIFeatureGuide/service.md + url: + - https://127.0.0.1:9187 +- path: docs/content/en/docs/SQLReference/parser.md + url: + - http://example.com/stuff/index.html +- path: docs/content/zh/docs/PerformanceTuningGuide/测试MOT-TPCC性能.md + url: + - https://osdn.net/frs/g +- path: docs/content/zh/docs/DatabaseAdministrationGuide/MOT样例TPC-C基准.md + url: + - https://osdn.net/frs/g +- path: docs/content/en/docs/ToolandCommandReference/kdestroy.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/en/docs/ToolandCommandReference/krb5kdc.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/en/docs/ToolandCommandReference/kadmin-local.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/en/docs/ToolandCommandReference/klist.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/en/docs/ToolandCommandReference/kinit.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/en/docs/ToolandCommandReference/kdb5_util.md + url: + - https://web.mit.edu/kerberos/krb5-1.17/doc/admin/admin +- path: docs/content/en/docs/DataMigrationGuide/data-check.md + url: + - https://opengauss.obs.cn-south-1.myhuaweicloud.com/latest/tools/openGauss-datachecker-performance-5.0.0.tar.gz +- path: docs/content/en/docs/DataMigrationGuide/full-migration.md + url: + - https://opengauss.obs.cn-south-1.myhuaweicloud.com/latest/chameleon/chameleon-1.0.0-py3-none-any.whl +- path: docs/content/docs-lite/zh/docs/DatabaseAdministrationGuide/MOT样例TPC-C基准.md + url: + - https://osdn.net/frs/g +- path: docs/content/docs-lite/zh/docs/PerformanceTuningGuide/测试MOT-TPCC性能.md + url: + - https://osdn.net/frs/g +- path: docs/content/docs-lite/en/docs/PerformanceTuningGuide/testing-mot-tpcc-performance.md + url: + - https://osdn.net/frs/g +- path: docs/content/en/docs/PerformanceTuningGuide/testing-mot-tpc-c-performance.md + url: + - https://osdn.net/frs/g +- path: docs/content/en/docs/DatabaseAdministrationGuide/mot-sample-tpc-c-benchmark.md + url: + - https://osdn.net/frs/g +- path: docs/content/docs-lite/en/docs/DatabaseAdministrationGuide/mot-sample-tpc-c-benchmark.md + url: + - https://osdn.net/frs/g +- path: docs/content/zh/docs/ExtensionReference/PostGIS安装.md + url: + - https://gitee.com/opengauss/openGauss-third +- path: docs/content/en/docs/ExtensionReference/installing-postgis.md + url: + - https://gitee.com/opengauss/openGauss-third +- path: docs/content/docs-lite/zh/docs/ReleaseNotes/源代码.md + url: + - https://gitee.com/opengauss/openGauss-third +- path: docs/content/docs-lite/en/docs/Releasenotes/source-code.md + url: + - https://gitee.com/opengauss/openGauss-third +- path: docs/content/zh/docs/ReleaseNotes/源代码.md + url: + - https://gitee.com/opengauss/openGauss-third +- path: docs/content/en/docs/ReleaseNotes/source-code.md + url: + - https://gitee.com/opengauss/openGauss-third \ No newline at end of file -- Gitee