代码拉取完成,页面将自动刷新
同步操作将从 衣沾不足惜/gitee-ai-docs 强制同步,此操作会覆盖自 Fork 仓库以来所做的任何修改,且无法恢复!!!
确定后同步将在后台操作,完成时将刷新页面,请耐心等待。
import requests
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from uuid import uuid4
from langchain_chroma import Chroma
import html2text
def is_port_open(url, timeout=5):
    """Report whether an HTTP endpoint accepts connections.

    Args:
        url: Address to probe with an HTTP GET request.
        timeout: Seconds to wait for the connection and for the response
            before giving up. Pass ``None`` to wait indefinitely.

    Returns:
        True if a connection to ``url`` could be established, False if the
        connection failed or timed out while connecting.
    """
    try:
        requests.get(url, timeout=timeout)
    except requests.ReadTimeout:
        # The connection was established but the server was slow to answer,
        # so the port itself is open.
        return True
    except (requests.ConnectionError, requests.Timeout):
        return False
    return True
def get_request_json(url, timeout=10):
    """Fetch ``url`` and return its body decoded as JSON.

    This is a best-effort helper: network failures and bodies that are not
    valid JSON do not raise, they yield an empty dict instead.

    Args:
        url: Address to request with HTTP GET.
        timeout: Seconds to wait for the server before giving up. Pass
            ``None`` to wait indefinitely.

    Returns:
        The decoded JSON payload, or ``{}`` if the request failed or the
        response body could not be decoded.
    """
    try:
        response = requests.get(url, timeout=timeout)
        return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection/timeout/URL errors; ValueError
        # covers JSON decoding failures. Anything else (for example
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return {}
# def load_network_data():
# try:
# docs_serverless_api_list = get_request_json(
# "https://ai.gitee.com/api/pay/services?type=serverless&page=1&size=200")
# docs_serverless_api_list['title'] = 'Serverless API 列表 - Gitee - AI。包含名称、价格、描述、链接等信息。'
# for item in docs_serverless_api_list['items']:
# item.pop('props', None)
# if not isinstance(docs_serverless_api_list, list):
# docs_serverless_api_list = [docs_serverless_api_list]
# with open('./data/serverless_api_list.json', 'w', encoding='utf-8') as json_file:
# json.dump(docs_serverless_api_list, json_file,
# ensure_ascii=False, indent=1)
# except Exception as e:
# print(f"载入外部数据失败{e},跳过")
def html2md(html_json_path, md_json_path):
    """Convert HTML captured by playwright into markdown.

    Reads a JSON list from ``html_json_path`` whose items carry the keys
    "标题", "链接" and "html", converts each "html" value to markdown, and
    writes a JSON list of items with the keys "标题", "链接" and "md" to
    ``md_json_path``.

    Args:
        html_json_path: Path of the input JSON file.
        md_json_path: Path of the output JSON file (overwritten).
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = False
    converter.mark_code = True
    converter.escape_all = True
    converter.wrap_tables = False
    converter.pad_tables = False
    # converter.ignore_tables = True
    converter.body_width = 0
    converter.wrap_links = False
    converter.unicode_snob = True

    with open(html_json_path, 'r', encoding='utf-8') as src:
        pages = json.load(src)

    converted = [
        {
            "标题": page['标题'],
            "链接": page['链接'],
            "md": converter.handle(page["html"]),
        }
        for page in pages
    ]

    with open(md_json_path, 'w', encoding='utf-8') as dst:
        json.dump(converted, dst, ensure_ascii=False, indent=1)
def split_md_on_header(markdown_document):
    """Split a markdown document into smaller chunks at its headers.

    Splits on level 1 to 3 headers ("#", "##", "###") using
    ``MarkdownHeaderTextSplitter`` with ``strip_headers=False`` and
    ``return_each_line=False``.

    Args:
        markdown_document: The markdown text to split.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` returns for the
        document.
    """
    header_levels = [("#" * depth, str(depth)) for depth in (1, 2, 3)]
    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=header_levels,
        strip_headers=False,
        return_each_line=False,
    )
    chunks = splitter.split_text(markdown_document)
    return chunks
def create_res_document(md_json_path, max_chars=1500):
    """Build ``Document`` objects from a JSON file of markdown pages.

    The file at ``md_json_path`` must contain a JSON list whose items carry
    the keys "md", "链接" and "标题". Pages shorter than ``max_chars``
    characters become a single document; longer pages are split at their
    markdown headers and each chunk becomes its own document. Every document
    gets the page's "链接" and "标题" as metadata.

    Args:
        md_json_path: Path of the JSON file to read.
        max_chars: Length threshold, in characters, at or above which a page
            is split by header instead of being kept whole.

    Returns:
        A list of ``Document`` objects, in page order.
    """
    with open(md_json_path, 'r', encoding='utf-8') as file:
        md_list = json.load(file)

    docs = []
    for md in md_list:
        link, title = md['链接'], md['标题']
        if len(md['md']) < max_chars:
            docs.append(Document(page_content=md['md'], metadata={
                "链接": link, "标题": title}))
            continue

        print(title, f"md 长度大于 {max_chars} 字符 将按标题拆分")
        # A fresh metadata dict is built per chunk so documents never share
        # one mutable mapping.
        docs.extend(
            Document(page_content=chunk.page_content, metadata={
                "链接": link, "标题": title})
            for chunk in split_md_on_header(md['md'])
        )
    return docs
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违反国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。