master

分支 (1)

标签 (2)

管理

管理

master

0.3.1

0.3.0

gitee-ai-docs-mr100
/
utils.py

import requests
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from uuid import uuid4
from langchain_chroma import Chroma
import html2text


def is_port_open(url, timeout=5):
    """Report whether an HTTP GET to ``url`` reaches a listening server.

    Args:
        url: Address to probe with a GET request.
        timeout: Seconds to wait for the connection and for response data.
            Without a timeout ``requests`` can block indefinitely on a host
            that never answers.

    Returns:
        True if a server accepted the connection (regardless of the HTTP
        status code), False if the connection could not be established.
    """
    try:
        requests.get(url, timeout=timeout)
    except requests.ConnectionError:
        # Also covers ConnectTimeout: nothing accepted the connection in time.
        return False
    except requests.ReadTimeout:
        # The connection was accepted but the reply was slow, so the port
        # itself is open.
        return True
    return True


def get_request_json(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return its decoded JSON body.

    This is a best-effort helper: failures are reported as an empty dict
    rather than raised.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the connection and for response data,
            so an unresponsive server cannot block the caller forever.

    Returns:
        The parsed JSON value, or ``{}`` when the request fails or the body
        is not valid JSON.
    """
    try:
        response = requests.get(url, timeout=timeout)
        return response.json()
    except (requests.RequestException, ValueError):
        # RequestException: connection, timeout and invalid-URL errors.
        # ValueError: JSON decoding errors. Unlike a bare ``except``, this
        # lets KeyboardInterrupt and SystemExit propagate.
        return {}


# def load_network_data():
#     try:
#         docs_serverless_api_list = get_request_json(
#             "https://ai.gitee.com/api/pay/services?type=serverless&page=1&size=200")
#         docs_serverless_api_list['title'] = 'Serverless API 列表 - Gitee - AI。包含名称、价格、描述、链接等信息。'
#         for item in docs_serverless_api_list['items']:
#             item.pop('props', None)
#         if not isinstance(docs_serverless_api_list, list):
#             docs_serverless_api_list = [docs_serverless_api_list]
#         with open('./data/serverless_api_list.json', 'w', encoding='utf-8') as json_file:
#             json.dump(docs_serverless_api_list, json_file,
#                       ensure_ascii=False, indent=1)
#     except Exception as e:
#         print(f"载入外部数据失败{e}，跳过")


def html2md(html_json_path, md_json_path):
    """Convert Playwright-scraped HTML records to Markdown.

    Reads a JSON list from ``html_json_path`` whose items carry the keys
    "标题", "链接" and "html", converts each item's HTML with ``html2text``,
    and writes a JSON list to ``md_json_path`` in which each item keeps
    "标题" and "链接" and stores the converted text under "md".

    Args:
        html_json_path: Path of the input JSON file.
        md_json_path: Path of the output JSON file (overwritten).
    """
    converter = html2text.HTML2Text()
    # Content options.
    converter.unicode_snob = True
    converter.escape_all = True
    converter.mark_code = True
    converter.ignore_links = False
    # Layout options; ignore_tables is deliberately left at its default.
    converter.body_width = 0
    converter.wrap_links = False
    converter.wrap_tables = False
    converter.pad_tables = False

    with open(html_json_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    converted = [
        {
            "标题": record['标题'],
            "链接": record['链接'],
            "md": converter.handle(record["html"]),
        }
        for record in records
    ]

    # The output is opened only after every record converted successfully.
    with open(md_json_path, 'w', encoding='utf-8') as target:
        json.dump(converted, target, ensure_ascii=False, indent=1)


def split_md_on_header(markdown_document):
    """Split a Markdown string into smaller chunks at its headings.

    Splitting happens on level 1-3 headings (``#``, ``##``, ``###``), and
    the splitter is configured with ``strip_headers=False``.

    Args:
        markdown_document: The Markdown text to split.

    Returns:
        The result of ``MarkdownHeaderTextSplitter.split_text`` for the
        given text.
    """
    # Pairs of (heading marker, metadata key): ("#", "1") ... ("###", "3").
    heading_levels = [("#" * depth, str(depth)) for depth in range(1, 4)]

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=heading_levels,
        strip_headers=False,
        return_each_line=False,
    )
    return splitter.split_text(markdown_document)


def create_res_document(md_json_path, max_length=1500):
    """Build ``Document`` objects from the Markdown records in a JSON file.

    Each record in the JSON list must carry the keys "md", "链接" and "标题".
    A record whose Markdown is shorter than ``max_length`` characters becomes
    one document. A longer record is split with ``split_md_on_header`` and
    yields one document per chunk. Every document gets the record's "链接"
    and "标题" as metadata.

    Args:
        md_json_path: Path of the JSON file to read.
        max_length: Character count at or above which a record is split by
            heading. Defaults to 1500, the previously hard-coded value.

    Returns:
        A list of ``Document`` objects in record order.
    """
    with open(md_json_path, 'r', encoding='utf-8') as file:
        md_list = json.load(file)

    docs = []
    for md in md_list:
        text = md['md']
        if len(text) < max_length:
            chunks = [text]
        else:
            print(md['标题'], f"md 长度大于 {max_length} 字符 将按标题拆分")
            chunks = [part.page_content for part in split_md_on_header(text)]
        # A fresh metadata dict per document avoids aliasing between chunks.
        docs.extend(
            Document(page_content=chunk, metadata={
                "链接": md['链接'], "标题": md['标题']})
            for chunk in chunks
        )
    return docs