From 788b9e597437a357cb43f7b4e07f224a7585d3bb Mon Sep 17 00:00:00 2001
From: Ethan-Zhang
Date: Fri, 15 Aug 2025 09:51:31 +0800
Subject: [PATCH] =?UTF-8?q?Feat:=20=E6=96=B0=E5=A2=9Efull=20text=E8=8E=B7?=
 =?UTF-8?q?=E5=8F=96=E6=8E=A5=E5=8F=A3?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this area is ignored by `git am`):
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of the patch; context indentation is a best guess. If a hunk fails to
   match, apply with `git apply --ignore-whitespace`.
 * The worker hunk now edits the existing "full_text" entry instead of adding
   a second "full_text" key to the same dict literal (same resulting dict).
 * The router import hunk no longer lists DeleteTemporaryDocumentResponse twice.
 * Post-image blob ids on the `index` lines of the two files above predate
   these fixes and were not recomputed.

 .../base/task/worker/parse_document_worker.py |  4 +++-
 data_chain/apps/router/document.py            | 14 +++++++++++++-
 data_chain/apps/service/document_service.py   | 17 +++++++++++++++++
 data_chain/entities/response_data.py          |  5 +++++
 4 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/data_chain/apps/base/task/worker/parse_document_worker.py b/data_chain/apps/base/task/worker/parse_document_worker.py
index d97cb3a..ec64c48 100644
--- a/data_chain/apps/base/task/worker/parse_document_worker.py
+++ b/data_chain/apps/base/task/worker/parse_document_worker.py
@@ -438,12 +438,14 @@ class ParseDocumentWorker(BaseWorker):
         else:
             abstract = full_text[:128]
         abstract_vector = await Embedding.vectorize_embedding(abstract)
+
+        # Persist the abstract together with the whitespace-trimmed full text.
         await DocumentManager.update_document_by_doc_id(
             doc_id,
             {
-                "full_text": full_text,
+                "full_text": full_text.strip(),
                 "abstract": abstract,
                 "abstract_vector": abstract_vector
             }
         )
         return abstract
diff --git a/data_chain/apps/router/document.py b/data_chain/apps/router/document.py
index 4d76828..1cae15e 100644
--- a/data_chain/apps/router/document.py
+++ b/data_chain/apps/router/document.py
@@ -28,7 +28,8 @@ from data_chain.entities.response_data import (
     GetTemporaryDocumentStatusResponse,
     UploadTemporaryDocumentResponse,
     GetTemporaryDocumentTextResponse,
-    DeleteTemporaryDocumentResponse
+    DeleteTemporaryDocumentResponse,
+    GetDocumentFullTextResponse
 )
 from data_chain.apps.service.session_service import get_user_sub, verify_user
 from data_chain.apps.service.router_service import get_route_info
@@ -203,3 +204,14 @@ async def delete_temporary_docs(
         req: Annotated[DeleteTemporaryDocumentRequest, Body()]):
     doc_ids = await DocumentService.delete_temporary_docs(user_sub, req.ids)
     return DeleteTemporaryDocumentResponse(result=doc_ids)
+
+
+@router.get('/full_text', response_model=GetDocumentFullTextResponse, dependencies=[Depends(verify_user)])
+async def get_doc_full_text(
+        user_sub: Annotated[str, Depends(get_user_sub)],
+        action: Annotated[str, Depends(get_route_info)],
+        doc_id: Annotated[UUID, Query(alias="docId")]):
+    if not (await DocumentService.validate_user_action_to_document(user_sub, doc_id, action)):
+        raise Exception("用户没有权限访问该文档")
+    full_text = await DocumentService.get_doc_full_text(doc_id)
+    return GetDocumentFullTextResponse(result=full_text)
diff --git a/data_chain/apps/service/document_service.py b/data_chain/apps/service/document_service.py
index 7ec0555..2f657bf 100644
--- a/data_chain/apps/service/document_service.py
+++ b/data_chain/apps/service/document_service.py
@@ -464,3 +464,20 @@ class DocumentService:
             err = "删除文档失败"
             logging.exception("[DocumentService] %s", err)
             raise e
+
+    @staticmethod
+    async def get_doc_full_text(doc_id: uuid.UUID) -> str:
+        """Return the stored full text of the document identified by doc_id."""
+        try:
+            doc_entity = await DocumentManager.get_document_by_doc_id(doc_id)
+            if doc_entity is None:
+                err = f"文档不存在, 文档ID: {doc_id}"
+                logging.error("[DocumentService] %s", err)
+                raise Exception(err)
+
+            # Fall back to an empty string when no full text has been stored.
+            return doc_entity.full_text or ""
+        except Exception as e:
+            err = f"获取文档全文失败, 文档ID: {doc_id}"
+            logging.exception("[DocumentService] %s", err)
+            raise e
diff --git a/data_chain/entities/response_data.py b/data_chain/entities/response_data.py
index a81e892..c313a47 100644
--- a/data_chain/entities/response_data.py
+++ b/data_chain/entities/response_data.py
@@ -304,6 +304,11 @@ class DeleteDocumentResponse(ResponseData):
     result: list[uuid.UUID] = Field(default=[], description="文档ID列表")
 
 
+class GetDocumentFullTextResponse(ResponseData):
+    """Response body for GET /doc/full_text."""
+    result: str = Field(default="", description="文档全文内容")
+
+
 class Chunk(BaseModel):
     """文档分片信息"""
     chunk_id: uuid.UUID = Field(description="分片ID", alias="chunkId")
-- 
Gitee