登录
注册
开源
企业版
高校版
搜索
帮助中心
使用条款
关于我们
开源
企业版
高校版
私有云
模力方舟
AI 队友
登录
注册
6 月 10 日(下周三晚 19:00)直播预告:龙虾还活着吗?来看 PocketClaw 上市两个月后对 AI 硬件的真实复盘,点击预约,开播有提醒~
代码拉取完成,页面将自动刷新
开源项目
>
程序开发
>
常用工具包
&&
捐赠
捐赠前请先登录
取消
前往登录
扫描微信二维码支付
取消
支付完成
支付提示
将跳转至支付宝完成支付
确定
取消
Watch
不关注
关注所有动态
仅关注版本发行动态
关注但不提醒动态
97
Star
852
Fork
237
程序员晚枫
/
python-office
代码
Issues
12
Pull Requests
0
Wiki
统计
流水线
服务
质量分析
Jenkins for Gitee
腾讯云托管
腾讯云 Serverless
悬镜安全
阿里云 SAE
Codeblitz
SBOM
开发画像分析
我知道了,不再自动展开
更新失败,请稍后重试!
移除标识
内容风险标识
本任务被
标识为内容中包含代码安全 Bug、隐私泄露等敏感信息,仓库外成员不可访问
尝试使用 OCR 功能,但打包后的程序无法运行;word2pdf 以及 pdf 转 ocr 需要使用以前教程中的代码才可以打包,使用整合的 office 库则不行。附上我运行的文件。
已完成
#I5AYLM
Henshin枫子
创建于
2022-06-07 00:07
C:\Users\Fwhui>C:\AutoWork\dist\Fwhui\Fwhui.exe Traceback (most recent call last): File "Fwhui.py", line 8, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddleocr\__init__.py", line 14, in <module> from .paddleocr import * File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddleocr\paddleocr.py", line 21, in <module> import paddle File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\__init__.py", line 71, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\dataset\__init__.py", line 27, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\dataset\flowers.py", line 39, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen 
importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\dataset\image.py", line 51, in <module> File "subprocess.py", line 800, in __init__ File "subprocess.py", line 1207, in _execute_child FileNotFoundError: [WinError 2] 系统找不到指定的文件。 [17932] Failed to execute script 'Fwhui' due to unhandled exception! ``` import PyPDF2 import pdfplumber from docx import Document import time from pathlib import Path from win32com.client import constants, gencache import os # 目录的操作 from paddleocr import PaddleOCR, draw_ocr def ocr(img_path): # pip install paddlepaddle # pip install shapely # pip install paddleocr # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 # from paddleocr import PaddleOCR, draw_ocr # 模型路径下必须含有model和params文件 ocr = PaddleOCR(use_angle_cls=True, use_gpu=False) # det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', # rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True img_path = lujing result = ocr.ocr(img_path, cls=True) f = open("识别结果.txt", "w") for line in result: print(line) f.writelines([str(line), '\n']) from PIL import Image f.close() image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] scores = [line[1][1] for line in result] # im_show = \ draw_ocr(image, boxes, txts, scores, font_path='simfang.ttf') # im_show = Image.fromarray(im_show) # im_show.save('result.jpg') # 结果图片保存在代码同级文件夹中。 # word转pdf------------------------------------------------# def createpdf(wordPath, pdfPath): word = gencache.EnsureDispatch('Word.Application') doc = word.Documents.Open(wordPath, ReadOnly=1) # 转换方法 doc.ExportAsFixedFormat(pdfPath, constants.wdExportFormatPDF) word.Quit() # 1、文件的批量转换 # 自己指定路径, # 为了适配wps不能转换doc的问题,这里限定:只能转换docx def docx2pdf(path, docxSuffix=".docx"): wordFiles = [] # 如果不存在,则不做处理 
if not os.path.exists(path): print("path does not exist path = " + path) return # 判断是否是文件 elif os.path.isfile(path): print("path file type is file " + path) wordFiles.append(path) # 如果是目录,则遍历目录下面的文件 elif os.path.isdir(path): print(os.listdir(path)) # 填充路径,补充完整路径 if not path.endswith("/") or not path.endswith("\\"): path = path + "/" for file in os.listdir(path): if file.endswith(docxSuffix): wordFiles.append(path + file) print(wordFiles) for file in wordFiles: filepath = os.path.abspath(file) index = filepath.rindex('.') pdfPath = filepath[:index] + '.pdf' print(pdfPath) createpdf(filepath, pdfPath) # pdf转word------------------------------------------------# def pdf2word(pdf_path): # 内容提取,使用 pdfplumber 打开 PDF,用于提取文本 with pdfplumber.open(pdf_path) as pdf_file: # 使用 PyPDF2 打开 PDF 用于提取图片 pdf_image_reader = PyPDF2.PdfFileReader(open(pdf_path, "rb")) print(pdf_image_reader.getNumPages()) content = '' docx = Document() file_name = Path(pdf_path).stem # len(pdf.pages)为PDF文档页数,一页页解析 for i in range(len(pdf_file.pages)): print("当前第 %s 页" % i) # pdf.pages[i] 是读取PDF文档第i+1页 page_text = pdf_file.pages[i] # page.extract_text()函数即读取文本内容 page_content = page_text.extract_text() if page_content: content = content + page_content + "\n" print(page_content) docx.add_paragraph(str(page_content)) time.sleep(0.1) docx.save(str(file_name) + ".docx") # 主程序 while True: if __name__ == '__main__': key = input('\n选择功能: 0.清理缓存 1.pdf转word 2.word转pdf 3.图片转文字\n———————————————————————————————————————\n') if key == "1": lujing = input('输入需要转换的文件的路径。\n') lujing = str(lujing) print(lujing) pdf2word(lujing) elif key == "0": print("正清理缓存文件...") os.system('@echo off & for /d %i in (%temp%\^_MEI*) do (rd /s /q "%i")>nul') exit("正在退出程序...") elif key == "2": lujing = input('输入需要转换的文件的路径。\n') lujing = str(lujing) print(lujing) docx2pdf(lujing) elif key == "3": lujing = input('输入需要转换的文件的路径。\n') lujing = str(lujing) print(lujing) ocr(lujing) ``` 文件链接:链接:https://pan.baidu.com/s/140gzuYsiK9bvQH0mkidHpg 提取码:xp4d
C:\Users\Fwhui>C:\AutoWork\dist\Fwhui\Fwhui.exe Traceback (most recent call last): File "Fwhui.py", line 8, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddleocr\__init__.py", line 14, in <module> from .paddleocr import * File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddleocr\paddleocr.py", line 21, in <module> import paddle File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\__init__.py", line 71, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\dataset\__init__.py", line 27, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\dataset\flowers.py", line 39, in <module> File "<frozen importlib._bootstrap>", line 983, in _find_and_load File "<frozen importlib._bootstrap>", line 967, in _find_and_load_unlocked File "<frozen 
importlib._bootstrap>", line 677, in _load_unlocked File "PyInstaller\loader\pyimod03_importers.py", line 495, in exec_module File "paddle\dataset\image.py", line 51, in <module> File "subprocess.py", line 800, in __init__ File "subprocess.py", line 1207, in _execute_child FileNotFoundError: [WinError 2] 系统找不到指定的文件。 [17932] Failed to execute script 'Fwhui' due to unhandled exception! ``` import PyPDF2 import pdfplumber from docx import Document import time from pathlib import Path from win32com.client import constants, gencache import os # 目录的操作 from paddleocr import PaddleOCR, draw_ocr def ocr(img_path): # pip install paddlepaddle # pip install shapely # pip install paddleocr # Paddleocr目前支持中英文、英文、法语、德语、韩语、日语,可以通过修改lang参数进行切换 # 参数依次为`ch`, `en`, `french`, `german`, `korean`, `japan`。 # from paddleocr import PaddleOCR, draw_ocr # 模型路径下必须含有model和params文件 ocr = PaddleOCR(use_angle_cls=True, use_gpu=False) # det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', # rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True img_path = lujing result = ocr.ocr(img_path, cls=True) f = open("识别结果.txt", "w") for line in result: print(line) f.writelines([str(line), '\n']) from PIL import Image f.close() image = Image.open(img_path).convert('RGB') boxes = [line[0] for line in result] txts = [line[1][0] for line in result] scores = [line[1][1] for line in result] # im_show = \ draw_ocr(image, boxes, txts, scores, font_path='simfang.ttf') # im_show = Image.fromarray(im_show) # im_show.save('result.jpg') # 结果图片保存在代码同级文件夹中。 # word转pdf------------------------------------------------# def createpdf(wordPath, pdfPath): word = gencache.EnsureDispatch('Word.Application') doc = word.Documents.Open(wordPath, ReadOnly=1) # 转换方法 doc.ExportAsFixedFormat(pdfPath, constants.wdExportFormatPDF) word.Quit() # 1、文件的批量转换 # 自己指定路径, # 为了适配wps不能转换doc的问题,这里限定:只能转换docx def docx2pdf(path, docxSuffix=".docx"): wordFiles = [] # 如果不存在,则不做处理 
if not os.path.exists(path): print("path does not exist path = " + path) return # 判断是否是文件 elif os.path.isfile(path): print("path file type is file " + path) wordFiles.append(path) # 如果是目录,则遍历目录下面的文件 elif os.path.isdir(path): print(os.listdir(path)) # 填充路径,补充完整路径 if not path.endswith("/") or not path.endswith("\\"): path = path + "/" for file in os.listdir(path): if file.endswith(docxSuffix): wordFiles.append(path + file) print(wordFiles) for file in wordFiles: filepath = os.path.abspath(file) index = filepath.rindex('.') pdfPath = filepath[:index] + '.pdf' print(pdfPath) createpdf(filepath, pdfPath) # pdf转word------------------------------------------------# def pdf2word(pdf_path): # 内容提取,使用 pdfplumber 打开 PDF,用于提取文本 with pdfplumber.open(pdf_path) as pdf_file: # 使用 PyPDF2 打开 PDF 用于提取图片 pdf_image_reader = PyPDF2.PdfFileReader(open(pdf_path, "rb")) print(pdf_image_reader.getNumPages()) content = '' docx = Document() file_name = Path(pdf_path).stem # len(pdf.pages)为PDF文档页数,一页页解析 for i in range(len(pdf_file.pages)): print("当前第 %s 页" % i) # pdf.pages[i] 是读取PDF文档第i+1页 page_text = pdf_file.pages[i] # page.extract_text()函数即读取文本内容 page_content = page_text.extract_text() if page_content: content = content + page_content + "\n" print(page_content) docx.add_paragraph(str(page_content)) time.sleep(0.1) docx.save(str(file_name) + ".docx") # 主程序 while True: if __name__ == '__main__': key = input('\n选择功能: 0.清理缓存 1.pdf转word 2.word转pdf 3.图片转文字\n———————————————————————————————————————\n') if key == "1": lujing = input('输入需要转换的文件的路径。\n') lujing = str(lujing) print(lujing) pdf2word(lujing) elif key == "0": print("正清理缓存文件...") os.system('@echo off & for /d %i in (%temp%\^_MEI*) do (rd /s /q "%i")>nul') exit("正在退出程序...") elif key == "2": lujing = input('输入需要转换的文件的路径。\n') lujing = str(lujing) print(lujing) docx2pdf(lujing) elif key == "3": lujing = input('输入需要转换的文件的路径。\n') lujing = str(lujing) print(lujing) ocr(lujing) ``` 文件链接:链接:https://pan.baidu.com/s/140gzuYsiK9bvQH0mkidHpg 提取码:xp4d
评论 (
1
)
登录
后才可以发表评论
状态
已完成
待办的
进行中
已完成
已关闭
负责人
未设置
标签
未设置
标签管理
里程碑
未关联里程碑
未关联里程碑
Pull Requests
未关联
未关联
关联的 Pull Requests 被合并后可能会关闭此 issue
分支
未关联
分支 (
-
)
标签 (
-
)
开始日期   -   截止日期
-
置顶选项
不置顶
置顶等级:高
置顶等级:中
置顶等级:低
优先级
不指定
严重
主要
次要
不重要
参与者(2)
Python
1
https://gitee.com/CoderWanFeng/python-office.git
git@gitee.com:CoderWanFeng/python-office.git
CoderWanFeng
python-office
python-office
点此查找更多帮助
搜索帮助
Git 命令在线学习
如何在 Gitee 导入 GitHub 仓库
Git 仓库基础操作
企业版和社区版功能对比
SSH 公钥设置
如何处理代码冲突
仓库体积过大,如何减小?
如何找回被删除的仓库数据
Gitee 产品配额说明
GitHub仓库快速导入Gitee及同步更新
什么是 Release(发行版)
将 PHP 项目自动发布到 packagist.org
评论
仓库举报
回到顶部
登录提示
该操作需登录 Gitee 帐号,请先登录后再操作。
立即登录
没有帐号,去注册