diff --git a/data_chain/manager/chunk_manager.py b/data_chain/manager/chunk_manager.py
index a3488aba3813fb2106b5bcf0dcd55cc97294ecd3..602eb165bbe017abd3fab9b41379039fde5c6efa 100644
--- a/data_chain/manager/chunk_manager.py
+++ b/data_chain/manager/chunk_manager.py
@@ -1,5 +1,5 @@
 # Copyright (c) Huawei Technologies Co., Ltd. 2023-2024. All rights reserved.
-from sqlalchemy import select, update, func, text, or_, and_, bindparam
+from sqlalchemy import select, update, func, text, or_, and_, bindparam, literal_column, Float
 from typing import List, Tuple, Dict, Optional
 import uuid
 from datetime import datetime
@@ -9,6 +9,7 @@ from data_chain.config.config import config
 from data_chain.stores.database.database import DocumentEntity, ChunkEntity, DataBase
 from data_chain.manager.knowledge_manager import KnowledgeBaseManager
 from data_chain.logger.logger import logger as logging
+from data_chain.parser.tools.token_tool import TokenTool
 import logging
 
 
@@ -254,14 +255,14 @@ class ChunkManager():
 
             # 构建查询SQL(移除USE INDEX,保留其他原有逻辑)
             base_sql = f"""
-            SELECT 
-                chunk.id, chunk.team_id, chunk.kb_id, chunk.doc_id, chunk.doc_name,
-                chunk.text, chunk.text_vector, chunk.tokens, chunk.type,
-                chunk.pre_id_in_parse_topology, chunk.parse_topology_type,
-                chunk.global_offset, chunk.local_offset, chunk.enabled, 
+            SELECT
+                chunk.id, chunk.team_id, chunk.kb_id, chunk.doc_id, chunk.doc_name,
+                chunk.text, chunk.text_vector, chunk.tokens, chunk.type,
+                chunk.pre_id_in_parse_topology, chunk.parse_topology_type,
+                chunk.global_offset, chunk.local_offset, chunk.enabled,
                 chunk.status, chunk.created_time, chunk.updated_time,
                 chunk.text_vector <=> :vector AS similarity_score
-            FROM chunk 
+            FROM chunk
             JOIN document ON document.id = chunk.doc_id
             WHERE {where_clause}
             AND (chunk.text_vector <=> :vector) IS NOT NULL
@@ -392,10 +393,9 @@ class ChunkManager():
             return []
 
     @staticmethod
-    async def get_top_k_chunk_by_kb_id_dynamic_weighted_keyword(
-            kb_id: uuid.UUID, query: str,  # 关键词列表改为单查询文本
-            top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = [],
-            chunk_to_type: str = None, pre_ids: list[uuid.UUID] = None) -> List[ChunkEntity]:
+    async def get_top_k_chunk_by_kb_id_bm25(kb_id: uuid.UUID, query: str,  # 关键词列表改为单查询文本
+                                            top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = [],
+                                            chunk_to_type: str = None, pre_ids: list[uuid.UUID] = None) -> List[ChunkEntity]:
         """根据知识库ID和查询文本查询文档解析结果(使用BM25直接打分)"""
         try:
             st = datetime.now()
@@ -410,7 +410,8 @@ class ChunkManager():
                 select(
                     ChunkEntity,
                     # 计算查询文本与chunk的BM25分数
-                    ChunkEntity.text.op('<&>')(query_param).label("similarity_score")
+                    ChunkEntity.text.op('<&>')(
+                        query_param).label("similarity_score")
                 )
                 # 关联文档表
                 .join(DocumentEntity, DocumentEntity.id == ChunkEntity.doc_id)
@@ -462,7 +463,130 @@ class ChunkManager():
             err = f"BM25查询失败: kb_id={kb_id}, query={query[:50]}..., error={str(e)[:150]}"
             logging.exception("[ChunkManager] %s", err)
             return []
-    
+
+    @staticmethod
+    async def get_top_k_chunk_by_kb_id_jieba(kb_id: uuid.UUID, query: str,  # 关键词列表改为单查询文本
+                                             top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = [],
+                                             chunk_to_type: str = None, pre_ids: list[uuid.UUID] = None) -> List[ChunkEntity]:
+        """根据知识库ID和关键词权重查询文档解析结果(修复NoneType报错+强制索引)"""
+        try:
+            keywords, weights = TokenTool.get_top_k_keywords_and_weights(
+                query, top_k=20)
+            st = datetime.now()
+            async with await DataBase.get_session() as session:
+                # 1. 分词器选择(保留原逻辑)
+                kb_entity = await KnowledgeBaseManager.get_knowledge_base_by_kb_id(kb_id)
+                if kb_entity.tokenizer == Tokenizer.ZH.value:
+                    tokenizer = 'chparser' if config['DATABASE_TYPE'].lower(
+                    ) == 'opengauss' else 'zhparser'
+                elif kb_entity.tokenizer == Tokenizer.EN.value:
+                    tokenizer = 'english'
+                else:
+                    tokenizer = 'chparser' if config['DATABASE_TYPE'].lower(
+                    ) == 'opengauss' else 'zhparser'
+
+                # 2. 构建加权关键词CTE(保留原逻辑)
+                params = {}
+                values_clause = []
+                for idx, (term, weight) in enumerate(zip(keywords, weights)):
+                    params[f"term_{idx}"] = term
+                    params[f"weight_{idx}"] = weight
+                    values_clause.append(
+                        f"(CAST(:term_{idx} AS TEXT), CAST(:weight_{idx} AS FLOAT8))")
+                values_text = f"(VALUES {', '.join(values_clause)}) AS t(term, weight)"
+                weighted_terms = (
+                    select(
+                        literal_column("t.term").label("term"),
+                        literal_column("t.weight").cast(Float).label("weight")
+                    )
+                    .select_from(text(values_text))
+                    .cte("weighted_terms")
+                )
+
+                # 3. 初始化查询(确保stmt始终是Select对象,不直接赋值None)
+                stmt = (
+                    select(
+                        ChunkEntity,
+                        func.sum(
+                            func.ts_rank_cd(ChunkEntity.text_ts_vector, func.to_tsquery(
+                                tokenizer, weighted_terms.c.term))
+                            * weighted_terms.c.weight
+                        ).label("similarity_score")
+                    )
+                    # 关联文档表
+                    .join(DocumentEntity, DocumentEntity.id == ChunkEntity.doc_id)
+                    .join(  # 关联CTE+强制触发GIN索引(核心优化)
+                        weighted_terms,
+                        ChunkEntity.text_ts_vector.op(
+                            '@@')(func.to_tsquery(tokenizer, weighted_terms.c.term)),
+                        isouter=False
+                    )
+                    # 基础过滤条件
+                    .where(DocumentEntity.enabled == True)
+                    .where(DocumentEntity.status != DocumentStatus.DELETED.value)
+                    .where(ChunkEntity.kb_id == kb_id)
+                    .where(ChunkEntity.enabled == True)
+                    .where(ChunkEntity.status != ChunkStatus.DELETED.value)
+                )
+
+                # 4. 动态条件:禁用ID(修复关键:用if-else确保stmt不被赋值为None)
+                if banned_ids:
+                    stmt = stmt.where(ChunkEntity.id.notin_(banned_ids))
+
+                # 5. 其他动态条件(同样用if-else确保链式调用不中断)
+                if doc_ids is not None:
+                    stmt = stmt.where(DocumentEntity.id.in_(doc_ids))
+                if chunk_to_type is not None:
+                    stmt = stmt.where(
+                        ChunkEntity.parse_topology_type == chunk_to_type)
+                if pre_ids is not None:
+                    stmt = stmt.where(
+                        ChunkEntity.pre_id_in_parse_topology.in_(pre_ids))
+
+                # 6. 分组、过滤分数、排序、限制行数(链式调用安全)
+                stmt = (stmt
+                        .group_by(ChunkEntity.id)  # 按chunk分组计算总权重
+                        .order_by(  # 按总分数降序
+                            func.sum(
+                                func.ts_rank_cd(ChunkEntity.text_ts_vector, func.to_tsquery(
+                                    tokenizer, weighted_terms.c.term))
+                                * weighted_terms.c.weight
+                            ).desc()
+                        )
+                        .limit(top_k)  # 限制返回数量
+                        )
+
+                # 7. 执行查询与结果处理(保留原逻辑)
+                result = await session.execute(stmt, params=params)
+                chunk_entities = result.scalars().all()
+
+                # 8. 日志输出
+                cost = (datetime.now() - st).total_seconds()
+                logging.warning(
+                    f"[ChunkManager] get_top_k_chunk_by_kb_id_jieba cost: {cost}s "
+                    f"| kb_id: {kb_id} | keywords: {keywords[:2]}... | match_count: {len(chunk_entities)}"
+                )
+                return chunk_entities
+
+        except Exception as e:
+            # 异常日志补充关键上下文
+            err = f"根据知识库ID和关键词权重查询失败: kb_id={kb_id}, keywords={keywords[:2]}..., error={str(e)[:150]}"
+            logging.exception("[ChunkManager] %s", err)
+            return []
+
+    @staticmethod
+    async def get_top_k_chunk_by_kb_id_dynamic_weighted_keyword(
+            kb_id: uuid.UUID, query: str,
+            top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = [],
+            chunk_to_type: str = None, pre_ids: list[uuid.UUID] = None) -> List[ChunkEntity]:
+        """根据知识库ID和关键词权重查询文档解析结果(动态加权关键词)"""
+        if config['DATABASE_TYPE'].lower() == 'postgres':
+            return await ChunkManager.get_top_k_chunk_by_kb_id_jieba(
+                kb_id, query, top_k, doc_ids, banned_ids, chunk_to_type, pre_ids)
+        else:
+            return await ChunkManager.get_top_k_chunk_by_kb_id_bm25(
+                kb_id, query, top_k, doc_ids, banned_ids, chunk_to_type, pre_ids)
+
     @staticmethod
     async def fetch_surrounding_chunk_by_doc_id_and_global_offset(
         doc_id: uuid.UUID, global_offset: int,
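For readers unfamiliar with the VALUES-based CTE used by `get_top_k_chunk_by_kb_id_jieba`, the sketch below builds the same `weighted_terms` construct in isolation and prints the SQL that SQLAlchemy renders for it. This is a minimal sketch assuming SQLAlchemy 1.4+; the two keywords and weights are placeholder values, not output of `TokenTool.get_top_k_keywords_and_weights`, and the rendered SQL shape may vary slightly by dialect.

```python
# Minimal sketch of the weighted-keyword CTE used in the jieba retrieval path.
from sqlalchemy import Float, literal_column, select, text

# Placeholder keyword/weight pairs bound as parameters (two rows in the VALUES list).
params = {"term_0": "数据库", "weight_0": 0.7, "term_1": "索引", "weight_1": 0.3}
values_text = (
    "(VALUES (CAST(:term_0 AS TEXT), CAST(:weight_0 AS FLOAT8)), "
    "(CAST(:term_1 AS TEXT), CAST(:weight_1 AS FLOAT8))) AS t(term, weight)"
)
weighted_terms = (
    select(
        literal_column("t.term").label("term"),
        literal_column("t.weight").cast(Float).label("weight"),
    )
    .select_from(text(values_text))
    .cte("weighted_terms")
)
# Each chunk's score is then SUM(ts_rank_cd(text_ts_vector, to_tsquery(term)) * weight),
# grouped by chunk id, as in the statement in the patch above.
print(select(weighted_terms.c.term, weighted_terms.c.weight))
```

Binding the terms and weights as parameters keeps the keyword values out of the SQL string itself; only the number of VALUES rows changes from query to query.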
diff --git a/data_chain/manager/document_manager.py b/data_chain/manager/document_manager.py
index 205ca937243c6d727568c9106886655990ca1913..ca271fbd84fa8b175d06d454294935d4191e3d73 100644
--- a/data_chain/manager/document_manager.py
+++ b/data_chain/manager/document_manager.py
@@ -11,6 +11,7 @@ from data_chain.entities.enum import KnowledgeBaseStatus, DocumentStatus
 from data_chain.manager.knowledge_manager import KnowledgeBaseManager
 from data_chain.entities.enum import Tokenizer, ChunkStatus
 from data_chain.entities.request_data import ListDocumentRequest
+from data_chain.parser.tools.token_tool import TokenTool
 from data_chain.logger.logger import logger as logging
 
 
@@ -212,7 +213,7 @@ class DocumentManager():
             raise e
 
     @staticmethod
-    async def get_top_k_document_by_kb_id_dynamic_weighted_keyword(
+    async def get_top_k_document_by_kb_id_bm25(
             kb_id: uuid.UUID, query: str,  # 关键词列表改为单查询文本,移除weights参数
             top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = []) -> List[DocumentEntity]:
         """根据知识库ID和查询文本查询文档(BM25检索版,匹配abstract_bm25_index索引)"""
@@ -273,6 +274,117 @@ class DocumentManager():
             logging.exception("[DocumentManager] %s", err)
             return []
 
+    @staticmethod
+    async def get_top_k_document_by_kb_id_jieba(
+            kb_id: uuid.UUID, query: str,  # 关键词列表改为单查询文本,移除weights参数
+            top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = []) -> List[DocumentEntity]:
+        try:
+            keywords, weights = TokenTool.get_top_k_keywords_and_weights(query)
+            st = datetime.now()  # 新增计时日志
+            async with await DataBase.get_session() as session:
+                # 1. 分词器选择(与第一个方法保持一致)
+                kb_entity = await KnowledgeBaseManager.get_knowledge_base_by_kb_id(kb_id)
+                if kb_entity.tokenizer == Tokenizer.ZH.value:
+                    tokenizer = 'chparser' if config['DATABASE_TYPE'].lower(
+                    ) == 'opengauss' else 'zhparser'
+                elif kb_entity.tokenizer == Tokenizer.EN.value:
+                    tokenizer = 'english'
+                else:
+                    tokenizer = 'chparser' if config['DATABASE_TYPE'].lower(
+                    ) == 'opengauss' else 'zhparser'
+
+                # 2. 构建加权关键词CTE(保留原逻辑)
+                params = {}
+                values_clause = []
+                for idx, (term, weight) in enumerate(zip(keywords, weights)):
+                    params[f"term_{idx}"] = term
+                    params[f"weight_{idx}"] = weight
+                    values_clause.append(
+                        f"(CAST(:term_{idx} AS TEXT), CAST(:weight_{idx} AS FLOAT8))")
+                values_text = f"(VALUES {', '.join(values_clause)}) AS t(term, weight)"
+                weighted_terms = (
+                    select(
+                        literal_column("t.term").label("term"),
+                        literal_column("t.weight").cast(Float).label("weight")
+                    )
+                    .select_from(text(values_text))
+                    .cte("weighted_terms")
+                )
+
+                # 3. 初始化查询(确保stmt始终是Select对象)
+                stmt = (
+                    select(
+                        DocumentEntity,
+                        func.sum(
+                            func.ts_rank_cd(DocumentEntity.abstract_ts_vector, func.to_tsquery(
+                                tokenizer, weighted_terms.c.term))
+                            * weighted_terms.c.weight
+                        ).label("similarity_score")
+                    )
+                    # 关联CTE+强制触发GIN索引(核心优化)
+                    .join(
+                        weighted_terms,
+                        DocumentEntity.abstract_ts_vector.op(
+                            '@@')(func.to_tsquery(tokenizer, weighted_terms.c.term)),
+                        isouter=False
+                    )
+                    # 基础过滤条件
+                    .where(DocumentEntity.enabled == True)
+                    .where(DocumentEntity.status != DocumentStatus.DELETED.value)
+                    .where(DocumentEntity.kb_id == kb_id)
+                )
+
+                # 4. 动态条件:禁用ID(确保stmt链式调用不中断)
+                if banned_ids:
+                    stmt = stmt.where(DocumentEntity.id.notin_(banned_ids))
+
+                # 5. 其他动态条件
+                if doc_ids is not None:
+                    stmt = stmt.where(DocumentEntity.id.in_(doc_ids))
+
+                # 6. 分组、过滤分数、排序、限制行数(链式调用安全)
+                stmt = (stmt
+                        .group_by(DocumentEntity.id)  # 按文档ID分组计算总权重
+                        .order_by(  # 按总分数降序
+                            func.sum(
+                                func.ts_rank_cd(DocumentEntity.abstract_ts_vector, func.to_tsquery(
+                                    tokenizer, weighted_terms.c.term))
+                                * weighted_terms.c.weight
+                            ).desc()
+                        )
+                        .limit(top_k)  # 限制返回数量
+                        )
+
+                # 7. 执行查询与结果处理
+                result = await session.execute(stmt, params=params)
+                doc_entities = result.scalars().all()
+
+                # 8. 新增执行时间日志
+                cost = (datetime.now() - st).total_seconds()
+                logging.warning(
+                    f"[DocumentManager] get_top_k_document_by_kb_id_jieba cost: {cost}s "
+                    f"| kb_id: {kb_id} | keywords: {keywords[:2]}... | match_count: {len(doc_entities)}"
+                )
+                return doc_entities
+
+        except Exception as e:
+            # 异常日志补充关键上下文
+            err = f"根据知识库ID和关键词权重查询文档失败: kb_id={kb_id}, keywords={keywords[:2]}..., error={str(e)[:150]}"
+            logging.exception("[DocumentManager] %s", err)
+            return []
+
+    @staticmethod
+    async def get_top_k_document_by_kb_id_dynamic_weighted_keyword(
+            kb_id: uuid.UUID, query: str,  # 关键词列表改为单查询文本,移除weights参数
+            top_k: int, doc_ids: list[uuid.UUID] = None, banned_ids: list[uuid.UUID] = []) -> List[DocumentEntity]:
+        """根据知识库ID和查询文本查询文档(动态加权关键词版)"""
+        if config['DATABASE_TYPE'].lower() == 'postgres':
+            return await DocumentManager.get_top_k_document_by_kb_id_jieba(
+                kb_id, query, top_k, doc_ids, banned_ids)
+        else:
+            return await DocumentManager.get_top_k_document_by_kb_id_bm25(
+                kb_id, query, top_k, doc_ids, banned_ids)
+
     @staticmethod
     async def get_doc_cnt_by_kb_id(kb_id: uuid.UUID) -> int:
         """根据知识库ID获取文档数量"""
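Document-level retrieval is routed the same way: `get_top_k_document_by_kb_id_dynamic_weighted_keyword` now delegates to the jieba-weighted path on PostgreSQL and to the BM25 path otherwise. A minimal calling sketch follows; the kb_id and query are placeholders, and an initialized config/database is assumed.

```python
import asyncio
import uuid

from data_chain.manager.document_manager import DocumentManager


async def demo() -> None:
    # Placeholder knowledge-base ID; substitute a real one from your deployment.
    kb_id = uuid.UUID("00000000-0000-0000-0000-000000000000")
    docs = await DocumentManager.get_top_k_document_by_kb_id_dynamic_weighted_keyword(
        kb_id=kb_id,
        query="数据库索引如何优化",
        top_k=5,
    )
    # The selected backend (jieba-weighted or BM25) is transparent to the caller.
    for doc in docs:
        print(doc.id)


asyncio.run(demo())
```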
diff --git a/utils/document/upload.py b/utils/document/upload.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c6aa500d0debcdd91dadb89c90477e3d1fb5710
--- /dev/null
+++ b/utils/document/upload.py
@@ -0,0 +1,115 @@
+import argparse
+import os
+import requests
+import uuid
+from pathlib import Path
+
+
+def upload_documents(api_url, token, kb_id, doc_dir, batch_size=100):
+    """
+    批量上传文档到指定知识库
+
+    Args:
+        api_url: API接口地址
+        token: 认证令牌
+        kb_id: 知识库ID
+        doc_dir: 文档所在目录
+        batch_size: 每次上传的文档数量
+    """
+    # 检查文档目录是否存在
+    if not os.path.isdir(doc_dir):
+        print(f"错误:文档目录 {doc_dir} 不存在")
+        return
+
+    # 获取目录下的所有文件
+    doc_files = [f for f in os.listdir(doc_dir)
+                 if os.path.isfile(os.path.join(doc_dir, f))]
+
+    if not doc_files:
+        print(f"警告:文档目录 {doc_dir} 中没有文件")
+        return
+
+    print(f"发现 {len(doc_files)} 个文件,将以每次 {batch_size} 个的批次上传")
+
+    # 按批次处理文件
+    total_uploaded = 0
+    headers = {"Authorization": f"Bearer {token}"}
+
+    for i in range(0, len(doc_files), batch_size):
+        batch = doc_files[i:i+batch_size]
+        print(f"\n正在上传第 {i//batch_size + 1} 批,共 {len(batch)} 个文件...")
+
+        # 准备文件数据
+        files = []
+        for filename in batch:
+            file_path = os.path.join(doc_dir, filename)
+            try:
+                files.append(
+                    ('docs', (filename, open(file_path, 'rb'), None))
+                )
+            except Exception as e:
+                print(f"无法打开文件 {filename}:{str(e)},已跳过")
+
+        if not files:
+            print("本批次没有可上传的文件,跳过")
+            continue
+
+        # 发送请求
+        try:
+            response = requests.post(
+                f"{api_url}/doc",
+                headers=headers,
+                params={"kbId": kb_id},
+                files=files
+            )
+
+            # 关闭所有文件
+            for _, (_, file_obj, _) in files:
+                file_obj.close()
+
+            if response.status_code == 200:
+                result = response.json()
+                if result.get("code") == 200 and "result" in result:
+                    print(f"成功上传 {len(result['result'])} 个文件,文档ID:")
+                    print(result["result"])
+                    total_uploaded += len(result["result"])
+                else:
+                    print(f"上传失败:{result.get('message', '未知错误')}")
+            else:
+                print(f"请求失败,状态码:{response.status_code}")
+                print(f"响应内容:{response.text}")
+
+        except Exception as e:
+            print(f"上传过程出错:{str(e)}")
+
+    print(f"\n上传完成,共成功上传 {total_uploaded} 个文件")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='批量上传文档到知识库')
+    parser.add_argument('doc_dir', help='文档所在的目录')
+    parser.add_argument('kb_id', help='知识库ID (kb_id)')
+    parser.add_argument('--batch-size', type=int, default=100,
+                        help='每次上传的文档数量,默认100')
+    parser.add_argument('--api-url', required=True,
+                        help='API接口基础地址 (例如: http://localhost:9988)')
+    parser.add_argument('--token', required=True,
+                        help='认证令牌 (Bearer token)')
+
+    args = parser.parse_args()
+
+    # 验证kb_id格式
+    try:
+        uuid.UUID(args.kb_id)
+    except ValueError:
+        print(f"错误:{args.kb_id} 不是有效的UUID格式")
+        return
+
+    upload_documents(
+        api_url=args.api_url,
+        token=args.token,
+        kb_id=args.kb_id,
+        doc_dir=args.doc_dir,
+        batch_size=args.batch_size
+    )
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
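The new utils/document/upload.py is meant to be run directly (positional doc_dir and kb_id, plus required --api-url and --token), but it can also be driven from Python. A sketch with placeholder values, assuming the repository root is on PYTHONPATH:

```python
from utils.document.upload import upload_documents

# All values below are placeholders; point them at a real service and knowledge base.
upload_documents(
    api_url="http://localhost:9988",
    token="your-bearer-token",
    kb_id="00000000-0000-0000-0000-000000000000",
    doc_dir="./docs_to_upload",
    batch_size=100,
)
```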
diff --git a/utils/README.md b/utils/embedding/README.md
similarity index 100%
rename from utils/README.md
rename to utils/embedding/README.md
diff --git a/utils/__init__.py b/utils/embedding/__init__.py
similarity index 100%
rename from utils/__init__.py
rename to utils/embedding/__init__.py
diff --git a/utils/common/.env.example b/utils/embedding/common/.env.example
similarity index 100%
rename from utils/common/.env.example
rename to utils/embedding/common/.env.example
diff --git a/utils/config/config.py b/utils/embedding/config/config.py
similarity index 100%
rename from utils/config/config.py
rename to utils/embedding/config/config.py
diff --git a/utils/docs/prompt.yaml b/utils/embedding/docs/prompt.yaml
similarity index 100%
rename from utils/docs/prompt.yaml
rename to utils/embedding/docs/prompt.yaml
diff --git a/utils/docs/sensitive_patterns.txt b/utils/embedding/docs/sensitive_patterns.txt
similarity index 100%
rename from utils/docs/sensitive_patterns.txt
rename to utils/embedding/docs/sensitive_patterns.txt
diff --git a/utils/docs/sensitive_words.txt b/utils/embedding/docs/sensitive_words.txt
similarity index 100%
rename from utils/docs/sensitive_words.txt
rename to utils/embedding/docs/sensitive_words.txt
diff --git a/utils/docs/term_replacements.txt b/utils/embedding/docs/term_replacements.txt
similarity index 100%
rename from utils/docs/term_replacements.txt
rename to utils/embedding/docs/term_replacements.txt
diff --git a/utils/main.py b/utils/embedding/main.py
similarity index 100%
rename from utils/main.py
rename to utils/embedding/main.py
diff --git a/utils/my_tools/__init__.py b/utils/embedding/my_tools/__init__.py
similarity index 100%
rename from utils/my_tools/__init__.py
rename to utils/embedding/my_tools/__init__.py
diff --git a/utils/my_tools/bge_finetune/README.md b/utils/embedding/my_tools/bge_finetune/README.md
similarity index 100%
rename from utils/my_tools/bge_finetune/README.md
rename to utils/embedding/my_tools/bge_finetune/README.md
diff --git a/utils/my_tools/bge_finetune/__init__.py b/utils/embedding/my_tools/bge_finetune/__init__.py
similarity index 100%
rename from utils/my_tools/bge_finetune/__init__.py
rename to utils/embedding/my_tools/bge_finetune/__init__.py
diff --git a/utils/my_tools/bge_finetune/data_processing.py b/utils/embedding/my_tools/bge_finetune/data_processing.py
similarity index 100%
rename from utils/my_tools/bge_finetune/data_processing.py
rename to utils/embedding/my_tools/bge_finetune/data_processing.py
diff --git a/utils/my_tools/bge_finetune/ds_stage0.json b/utils/embedding/my_tools/bge_finetune/ds_stage0.json
similarity index 100%
rename from utils/my_tools/bge_finetune/ds_stage0.json
rename to utils/embedding/my_tools/bge_finetune/ds_stage0.json
diff --git a/utils/my_tools/bge_finetune/eval.py b/utils/embedding/my_tools/bge_finetune/eval.py
similarity index 100%
rename from utils/my_tools/bge_finetune/eval.py
rename to utils/embedding/my_tools/bge_finetune/eval.py
diff --git a/utils/my_tools/bge_finetune/get_report.py b/utils/embedding/my_tools/bge_finetune/get_report.py
similarity index 100%
rename from utils/my_tools/bge_finetune/get_report.py
rename to utils/embedding/my_tools/bge_finetune/get_report.py
diff --git a/utils/my_tools/bge_finetune/hn_mine.py b/utils/embedding/my_tools/bge_finetune/hn_mine.py
similarity index 100%
rename from utils/my_tools/bge_finetune/hn_mine.py
rename to utils/embedding/my_tools/bge_finetune/hn_mine.py
diff --git a/utils/my_tools/bge_finetune/how_to_better.py b/utils/embedding/my_tools/bge_finetune/how_to_better.py
similarity index 100%
rename from utils/my_tools/bge_finetune/how_to_better.py
rename to utils/embedding/my_tools/bge_finetune/how_to_better.py
diff --git a/utils/my_tools/llm.py b/utils/embedding/my_tools/llm.py
similarity index 100%
rename from utils/my_tools/llm.py
rename to utils/embedding/my_tools/llm.py
diff --git a/utils/my_tools/logger.py b/utils/embedding/my_tools/logger.py
similarity index 100%
rename from utils/my_tools/logger.py
rename to utils/embedding/my_tools/logger.py
diff --git a/utils/parser/handler/base_parser.py b/utils/embedding/parser/handler/base_parser.py
similarity index 100%
rename from utils/parser/handler/base_parser.py
rename to utils/embedding/parser/handler/base_parser.py
diff --git a/utils/parser/handler/doc_parser.py b/utils/embedding/parser/handler/doc_parser.py
similarity index 100%
rename from utils/parser/handler/doc_parser.py
rename to utils/embedding/parser/handler/doc_parser.py
diff --git a/utils/parser/handler/docx_parser.py b/utils/embedding/parser/handler/docx_parser.py
similarity index 100%
rename from utils/parser/handler/docx_parser.py
rename to utils/embedding/parser/handler/docx_parser.py
diff --git a/utils/parser/handler/html_parser.py b/utils/embedding/parser/handler/html_parser.py
similarity index 100%
rename from utils/parser/handler/html_parser.py
rename to utils/embedding/parser/handler/html_parser.py
diff --git a/utils/parser/handler/md_parser.py b/utils/embedding/parser/handler/md_parser.py
similarity index 100%
rename from utils/parser/handler/md_parser.py
rename to utils/embedding/parser/handler/md_parser.py
diff --git a/utils/parser/handler/pdf_parser.py b/utils/embedding/parser/handler/pdf_parser.py
similarity index 100%
rename from utils/parser/handler/pdf_parser.py
rename to utils/embedding/parser/handler/pdf_parser.py
diff --git a/utils/parser/handler/txt_parser.py b/utils/embedding/parser/handler/txt_parser.py
similarity index 100%
rename from utils/parser/handler/txt_parser.py
rename to utils/embedding/parser/handler/txt_parser.py
diff --git a/utils/parser/handler/xlsx_parser.py b/utils/embedding/parser/handler/xlsx_parser.py
similarity index 100%
rename from utils/parser/handler/xlsx_parser.py
rename to utils/embedding/parser/handler/xlsx_parser.py
diff --git a/utils/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams b/utils/embedding/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams
similarity index 100%
rename from utils/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams
rename to utils/embedding/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams
diff --git a/utils/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams.info b/utils/embedding/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams.info
similarity index 100%
rename from utils/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams.info
rename to utils/embedding/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdiparams.info
diff --git a/utils/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdmodel b/utils/embedding/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdmodel
similarity index 100%
rename from utils/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdmodel
rename to utils/embedding/parser/model/ocr/ch_PP-OCRv4_det_infer/inference.pdmodel
diff --git a/utils/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams b/utils/embedding/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams
similarity index 100%
rename from utils/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams
rename to utils/embedding/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams
diff --git a/utils/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams.info b/utils/embedding/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams.info
similarity index 100%
rename from utils/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams.info
rename to utils/embedding/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdiparams.info
diff --git a/utils/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdmodel b/utils/embedding/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdmodel
similarity index 100%
rename from utils/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdmodel
rename to utils/embedding/parser/model/ocr/ch_PP-OCRv4_rec_infer/inference.pdmodel
diff --git a/utils/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams b/utils/embedding/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams
similarity index 100%
rename from utils/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams
rename to utils/embedding/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams
diff --git a/utils/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams.info b/utils/embedding/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams.info
similarity index 100%
rename from utils/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams.info
rename to utils/embedding/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdiparams.info
diff --git a/utils/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel b/utils/embedding/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel
similarity index 100%
rename from utils/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel
rename to utils/embedding/parser/model/ocr/ch_ppocr_mobile_v2.0_cls_infer/inference.pdmodel
diff --git a/utils/parser/service/parser_service.py b/utils/embedding/parser/service/parser_service.py
similarity index 100%
rename from utils/parser/service/parser_service.py
rename to utils/embedding/parser/service/parser_service.py
diff --git a/utils/parser/tools/docx_patch.py b/utils/embedding/parser/tools/docx_patch.py
similarity index 100%
rename from utils/parser/tools/docx_patch.py
rename to utils/embedding/parser/tools/docx_patch.py
diff --git a/utils/parser/tools/ocr.py b/utils/embedding/parser/tools/ocr.py
similarity index 100%
rename from utils/parser/tools/ocr.py
rename to utils/embedding/parser/tools/ocr.py
diff --git a/utils/parser/tools/split.py b/utils/embedding/parser/tools/split.py
similarity index 100%
rename from utils/parser/tools/split.py
rename to utils/embedding/parser/tools/split.py
diff --git a/utils/service/document_governance.py b/utils/embedding/service/document_governance.py
similarity index 100%
rename from utils/service/document_governance.py
rename to utils/embedding/service/document_governance.py
diff --git a/utils/service/embedding_training.py b/utils/embedding/service/embedding_training.py
similarity index 100%
rename from utils/service/embedding_training.py
rename to utils/embedding/service/embedding_training.py
diff --git a/utils/service/qa_generate.py b/utils/embedding/service/qa_generate.py
similarity index 100%
rename from utils/service/qa_generate.py
rename to utils/embedding/service/qa_generate.py
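The block of renames above moves the embedding tooling from utils/ to utils/embedding/, while the new uploader lives under utils/document/. A quick sanity check of the resulting layout; the paths are taken directly from the renames, and the script is assumed to run from the repository root.

```python
from pathlib import Path

# Spot-check a few of the renamed files plus the new uploader.
expected = [
    Path("utils/embedding/main.py"),
    Path("utils/embedding/parser/handler/pdf_parser.py"),
    Path("utils/embedding/service/qa_generate.py"),
    Path("utils/document/upload.py"),
]
for path in expected:
    print(path, "OK" if path.exists() else "MISSING")
```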