1 Star 0 Fork 8

衣沾不足惜/Gitee AI 文档客服 - 马建仓

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
utils.py 3.17 KB
一键复制 编辑 原始数据 按行查看 历史
import requests
import json
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from uuid import uuid4
from langchain_chroma import Chroma
import html2text
def is_port_open(url, timeout=5):
    """Report whether an HTTP GET to ``url`` gets a response from a server.

    Any HTTP response counts as reachable; the status code is not inspected.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the connection and for the response.
            Without a timeout ``requests`` waits indefinitely, so a host
            that never answers would block the caller forever.

    Returns:
        True if a response was received, False if the connection failed or
        the server did not answer within ``timeout``.
    """
    try:
        requests.get(url, timeout=timeout)
    except (requests.ConnectionError, requests.Timeout):
        return False
    return True
def get_request_json(url, timeout=10):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    This is a best-effort helper: failures are not raised to the caller.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the connection and for the response,
            so that an unresponsive server cannot block the caller forever.

    Returns:
        The parsed JSON payload, or an empty dict if the request failed or
        the body was not valid JSON.
    """
    try:
        response = requests.get(url, timeout=timeout)
        return response.json()
    # Catch only request and JSON-decoding failures; a bare ``except`` would
    # also swallow KeyboardInterrupt and SystemExit.
    except (requests.RequestException, ValueError):
        return {}
# def load_network_data():
# try:
# docs_serverless_api_list = get_request_json(
# "https://ai.gitee.com/api/pay/services?type=serverless&page=1&size=200")
# docs_serverless_api_list['title'] = 'Serverless API 列表 - Gitee - AI。包含名称、价格、描述、链接等信息。'
# for item in docs_serverless_api_list['items']:
# item.pop('props', None)
# if not isinstance(docs_serverless_api_list, list):
# docs_serverless_api_list = [docs_serverless_api_list]
# with open('./data/serverless_api_list.json', 'w', encoding='utf-8') as json_file:
# json.dump(docs_serverless_api_list, json_file,
# ensure_ascii=False, indent=1)
# except Exception as e:
# print(f"载入外部数据失败{e},跳过")
def html2md(html_json_path, md_json_path):
    """Convert HTML scraped by playwright into Markdown.

    Reads a JSON list of records from ``html_json_path``; each record must
    provide the keys "标题", "链接" and "html". Writes a JSON list to
    ``md_json_path`` in which every record keeps "标题" and "链接" and carries
    the converted text under "md".
    """
    converter = html2text.HTML2Text()
    # ignore_tables is not set here, so the library default applies.
    converter_options = {
        "ignore_links": False,
        "mark_code": True,
        "escape_all": True,
        "wrap_tables": False,
        "pad_tables": False,
        "body_width": 0,
        "wrap_links": False,
        "unicode_snob": True,
    }
    for option_name, option_value in converter_options.items():
        setattr(converter, option_name, option_value)

    with open(html_json_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    converted = [
        {
            "标题": record['标题'],
            "链接": record['链接'],
            "md": converter.handle(record["html"]),
        }
        for record in records
    ]

    with open(md_json_path, 'w', encoding='utf-8') as target:
        json.dump(converted, target, ensure_ascii=False, indent=1)
def split_md_on_header(markdown_document):
    """Split a Markdown document into smaller chunks at its headings.

    Headings of level 1 to 3 act as split points, and the heading lines are
    kept in the chunk text (``strip_headers=False``).

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` yields for the
        given document.
    """
    # (heading marker, metadata key) pairs handed to the splitter.
    heading_levels = [("#", "1"), ("##", "2"), ("###", "3")]
    splitter = MarkdownHeaderTextSplitter(
        heading_levels,
        strip_headers=False,
        return_each_line=False,
    )
    chunks = splitter.split_text(markdown_document)
    return chunks
def create_res_document(md_json_path, max_chars=1500):
    """Build ``Document`` objects from a JSON file of Markdown records.

    Each record in the file must provide the keys "md", "链接" and "标题".
    A record whose Markdown is shorter than ``max_chars`` becomes a single
    document; longer ones are split at their headings, one document per
    chunk. Every document carries the record's "链接" and "标题" as metadata.

    Args:
        md_json_path: Path to the JSON file holding the list of records.
        max_chars: Length, in characters, at which a record is split by
            heading instead of being kept whole. Defaults to 1500, the
            value that used to be hard-coded.

    Returns:
        A list of ``Document`` objects in record order.
    """
    with open(md_json_path, 'r', encoding='utf-8') as file:
        md_list = json.load(file)

    docs = []
    for md in md_list:
        content = md['md']
        if len(content) < max_chars:
            chunks = [content]
        else:
            print(md['标题'], f"md 长度大于 {max_chars} 字符 将按标题拆分")
            chunks = [part.page_content
                      for part in split_md_on_header(content)]
        # A fresh metadata dict per document, so no two documents share one
        # mutable mapping.
        docs.extend(
            Document(page_content=chunk,
                     metadata={"链接": md['链接'], "标题": md['标题']})
            for chunk in chunks
        )
    return docs
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/stringify/gitee-ai-docs-mr100.git
git@gitee.com:stringify/gitee-ai-docs-mr100.git
stringify
gitee-ai-docs-mr100
Gitee AI 文档客服 - 马建仓
master

搜索帮助