diff --git a/data_chain/apps/base/task/worker/parse_document_worker.py b/data_chain/apps/base/task/worker/parse_document_worker.py index d97cb3a0996015baf37c609028e61c112fb0e4eb..ec64c48d88ee045c778ab41b7d46dfd252f0a541 100644 --- a/data_chain/apps/base/task/worker/parse_document_worker.py +++ b/data_chain/apps/base/task/worker/parse_document_worker.py @@ -438,12 +438,14 @@ class ParseDocumentWorker(BaseWorker): else: abstract = full_text[:128] abstract_vector = await Embedding.vectorize_embedding(abstract) + + # 更新文档的摘要和全文 await DocumentManager.update_document_by_doc_id( doc_id, { "full_text": full_text, "abstract": abstract, "abstract_vector": abstract_vector } ) return abstract diff --git a/data_chain/apps/router/document.py b/data_chain/apps/router/document.py index 4d76828cde61dd295032ff5dbdc1a07b0db252a4..1cae15e19fe572ef370e79eb93d99c1f4f92bb54 100644 --- a/data_chain/apps/router/document.py +++ b/data_chain/apps/router/document.py @@ -28,7 +28,8 @@ from data_chain.entities.response_data import ( GetTemporaryDocumentStatusResponse, UploadTemporaryDocumentResponse, GetTemporaryDocumentTextResponse, - DeleteTemporaryDocumentResponse + DeleteTemporaryDocumentResponse, + GetDocumentFullTextResponse ) from data_chain.apps.service.session_service import get_user_sub, verify_user from data_chain.apps.service.router_service import get_route_info @@ -203,3 +204,14 @@ async def delete_temporary_docs( req: Annotated[DeleteTemporaryDocumentRequest, Body()]): doc_ids = await DocumentService.delete_temporary_docs(user_sub, req.ids) return DeleteTemporaryDocumentResponse(result=doc_ids) + + +@router.get('/full_text', response_model=GetDocumentFullTextResponse, dependencies=[Depends(verify_user)]) +async def get_doc_full_text( + user_sub: Annotated[str, Depends(get_user_sub)], + action: Annotated[str, Depends(get_route_info)], + doc_id: Annotated[UUID, 
Query(alias="docId")]): + if not (await DocumentService.validate_user_action_to_document(user_sub, doc_id, action)): + raise Exception("用户没有权限访问该文档") + full_text = await DocumentService.get_doc_full_text(doc_id) + return GetDocumentFullTextResponse(result=full_text) diff --git a/data_chain/apps/router/other.py b/data_chain/apps/router/other.py index 24f9d001a069d4bc8fe4059392c7a5f78e21d1bd..ec2b28140e51a8c23630acf534e65cada51ec47c 100644 --- a/data_chain/apps/router/other.py +++ b/data_chain/apps/router/other.py @@ -12,6 +12,7 @@ from data_chain.entities.response_data import ( ListLLMMsg, ListLLMResponse, ListEmbeddingResponse, + ListRerankerResponse, ListTokenizerResponse, ListParseMethodResponse, ListSearchMethodResponse @@ -56,6 +57,22 @@ async def list_embeddings(): return ListEmbeddingResponse(result=embeddings) +@router.get("/reranker", response_model=ListRerankerResponse, dependencies=[Depends(verify_user)]) +async def get_reranker_config(): + return ListRerankerResponse( + result=[ + { + 'type': 'algorithm', + 'name': 'jaccard_dis_reranker' + }, + { + 'type': config['RERANKER_TYPE'], + 'name': config['RERANKER_MODEL_NAME'] + } + ] + ) + + @router.get('/tokenizer', response_model=ListTokenizerResponse, dependencies=[Depends(verify_user)]) async def list_tokenizers(): tokenizers = [tokenizer.value for tokenizer in Tokenizer] diff --git a/data_chain/apps/service/document_service.py b/data_chain/apps/service/document_service.py index 7ec05556bb5381615813ff6bc6829f8dc35ea9ec..2f657bf9c4cf707258b739b06aa7dc4fe7fbb7f4 100644 --- a/data_chain/apps/service/document_service.py +++ b/data_chain/apps/service/document_service.py @@ -464,3 +464,20 @@ class DocumentService: err = "删除文档失败" logging.exception("[DocumentService] %s", err) raise e + + @staticmethod + async def get_doc_full_text(doc_id: uuid.UUID) -> str: + """根据文档ID获取文档全文""" + try: + doc_entity = await DocumentManager.get_document_by_doc_id(doc_id) + if doc_entity is None: + err = f"文档不存在, 文档ID: {doc_id}" + 
logging.error("[DocumentService] %s", err) + raise Exception(err) + + # 返回文档的全文内容,如果为None则返回空字符串 + return doc_entity.full_text or "" + except Exception as e: + err = f"获取文档全文失败, 文档ID: {doc_id}" + logging.exception("[DocumentService] %s", err) + raise e diff --git a/data_chain/config/config.py b/data_chain/config/config.py index 2565e0dcd8ef107e7886e350a417b1f273e85b15..96b3f0716233d52995f94ad2fc8a004eb6e980ba 100644 --- a/data_chain/config/config.py +++ b/data_chain/config/config.py @@ -56,6 +56,11 @@ class ConfigModel(DictBaseModel): EMBEDDING_API_KEY: str = Field(None, description="embedding服务api key") EMBEDDING_ENDPOINT: str = Field(None, description="embedding服务url地址") EMBEDDING_MODEL_NAME: str = Field(None, description="embedding模型名称") + # Reranker + RERANKER_TYPE: str = Field(default="openai", description="reranker 服务的类型") + RERANKER_API_KEY: str = Field(None, description="reranker服务api key") + RERANKER_ENDPOINT: str = Field(None, description="reranker服务url地址") + RERANKER_MODEL_NAME: str = Field(None, description="reranker模型名称") # Token SESSION_TTL: int = Field(None, description="用户session过期时间") CSRF_KEY: str = Field(None, description="csrf的密钥") diff --git a/data_chain/entities/response_data.py b/data_chain/entities/response_data.py index a81e892f042ae6c13aacf0c29a0a8723029c3f32..778994753f2b55d6851b2df7418e4fcd0d16a6bb 100644 --- a/data_chain/entities/response_data.py +++ b/data_chain/entities/response_data.py @@ -304,6 +304,11 @@ class DeleteDocumentResponse(ResponseData): result: list[uuid.UUID] = Field(default=[], description="文档ID列表") +class GetDocumentFullTextResponse(ResponseData): + """GET /doc/full_text 响应""" + result: str = Field(default="", description="文档全文内容") + + class Chunk(BaseModel): """文档分片信息""" chunk_id: uuid.UUID = Field(description="分片ID", alias="chunkId") @@ -677,6 +682,11 @@ class ListEmbeddingResponse(ResponseData): result: list[str] = Field(default=[], description="向量化模型的列表数据结构") +class ListRerankerResponse(ResponseData): + """GET 
/other/reranker 响应""" + result: list[dict] = Field(default=[], description="重排序模型的列表数据结构") + + class ListTokenizerResponse(ResponseData): """GET /other/tokenizer 响应""" result: list[str] = Field(default=[], description="分词器的列表数据结构")