1 Star 0 Fork 0

庞博/python自动化相关

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
该仓库未声明开源许可证文件(LICENSE),使用请关注具体项目描述及其代码上游依赖。
克隆/下载
5.py 2.21 KB
一键复制 编辑 原始数据 按行查看 历史
庞博 提交于 2021-11-24 15:20 . 第一次提交
import os
from configparser import ConfigParser
from io import StringIO
from io import open
from concurrent.futures import ProcessPoolExecutor
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from docx import Document
def read_from_pdf(file_path):
    """Extract the text of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Everything the pdfminer ``TextConverter`` wrote to its output
        buffer while the document was processed, as a single string.
    """
    # Both the file handle and the text buffer are released by the ``with``
    # statement, even if pdfminer raises part-way through the document.
    with open(file_path, 'rb') as file, StringIO() as return_str:
        resource_manager = PDFResourceManager()
        device = TextConverter(
            resource_manager, return_str, laparams=LAParams())
        try:
            process_pdf(resource_manager, device, file)
        finally:
            # Close the converter on failure too, not only on success.
            device.close()
        # Read the buffer before the ``with`` block closes it.
        return return_str.getvalue()
def save_text_to_word(content, file_path):
    """Write text into a new Word document, one paragraph per line.

    Args:
        content: Text to store; it is split on newline characters.
        file_path: Path the document is saved to.
    """
    document = Document()
    for text_line in content.split('\n'):
        # Each line becomes its own paragraph holding a single run whose
        # text has had its control characters stripped.
        document.add_paragraph().add_run(
            remove_control_characters(text_line))
    document.save(file_path)
# Maps every code point below 32 (the ASCII control characters, which
# include tab, line feed and carriage return) to None so that
# ``str.translate`` deletes them.  Built once instead of on every call.
_CONTROL_CHARACTER_TABLE = dict.fromkeys(range(32))


def remove_control_characters(content):
    """Return ``content`` without characters whose code point is below 32.

    Args:
        content: The string to clean.

    Returns:
        A copy of ``content`` with those characters deleted; every other
        character, including DEL (127), is kept unchanged.
    """
    return content.translate(_CONTROL_CHARACTER_TABLE)
def pdf_to_word(pdf_file_path, word_file_path):
    """Convert one PDF file into a Word document containing its text.

    Args:
        pdf_file_path: Path of the PDF file to read.
        word_file_path: Path the resulting document is saved to.
    """
    save_text_to_word(read_from_pdf(pdf_file_path), word_file_path)
def main():
    """Convert every PDF in the configured folder to a Word document.

    Reads ``config.cfg`` from the current working directory and uses the
    ``pdf_folder`` and ``word_folder`` entries of its ``default`` section.
    Each ``.pdf`` file found directly in ``pdf_folder`` is converted, one
    after another, into a ``.docx`` file with the same base name inside
    ``word_folder``.

    Raises:
        KeyError: If the ``default`` section or one of the two folder
            entries is missing from the configuration.
        SystemExit: Always, with status 0, once all files are processed.
    """
    config_parser = ConfigParser()
    config_parser.read('config.cfg')
    config = config_parser['default']
    pdf_folder = config['pdf_folder']
    word_folder = config['word_folder']

    # Create the output folder up front so that saving cannot fail on a
    # missing directory after the PDF has already been parsed.
    if word_folder:
        os.makedirs(word_folder, exist_ok=True)

    for file in os.listdir(pdf_folder):
        file_name, extension_name = os.path.splitext(file)
        # Compare case-insensitively so that "REPORT.PDF" is picked up too.
        if extension_name.lower() != '.pdf':
            continue
        pdf_file = os.path.join(pdf_folder, file)
        word_file = os.path.join(word_folder, file_name + '.docx')
        print('正在处理: ', file)
        pdf_to_word(pdf_file, word_file)

    # ``exit`` is only injected by the ``site`` module; raising SystemExit
    # directly ends the script the same way without depending on it.
    raise SystemExit(0)
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
Python
1
https://gitee.com/pangbo2016/pythonAutoWork.git
git@gitee.com:pangbo2016/pythonAutoWork.git
pangbo2016
pythonAutoWork
python自动化相关
master

搜索帮助