# [Web-page residue, not part of the script] Code pull complete; the page will refresh automatically.
# Fetch a PDF from the web and read its text.
# PDF read operation
from urllib.request import urlopen
from urllib.error import URLError
from urllib.error import HTTPError
from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO, open
import sys
import os
# A local file opened with pdffile = open("../../readme.pdf") can be read as well.
# The URL of the PDF to download is the first command-line argument.
# Fail with a usage message rather than a bare IndexError when it is missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python " + sys.argv[0] + " <pdf_url>")
url = sys.argv[1]
# url = 'http://www.ynhtbank.com/ynhtyh/resource/cms/article/sub295208/378927/2018072715111046526.pdf'
def readPDF(filename):
    """Extract the text of a PDF and return it as a single string.

    Args:
        filename: Passed straight to ``process_pdf``. Despite the name, the
            caller in this script hands in an open file-like object (the
            ``urlopen`` response), not a path.

    Returns:
        str: Everything the ``TextConverter`` wrote into the in-memory
        buffer while the PDF was processed.
    """
    resmgr = PDFResourceManager()
    # The StringIO buffer collects the converter's output; `with` and
    # `finally` release the buffer and the converter even when processing
    # fails part-way through.
    with StringIO() as retstr:
        device = TextConverter(resmgr, retstr, laparams=LAParams())
        try:
            process_pdf(resmgr, device, filename)
        finally:
            device.close()
        # Read the buffer before the `with` block closes it.
        return retstr.getvalue()
# Download the PDF. Without a response there is nothing to convert, so stop
# here instead of failing later with a NameError on the unbound `pdffile`.
try:
    pdffile = urlopen(url)
except (URLError, HTTPError) as e:
    print("Errors:\n")
    print(e)
    sys.exit(1)

# Write the extracted text to pdftext.txt.
# Raw string: the backslash in the path must not be read as an escape sequence.
output_path = r'C:\PythonWorkspace/pdftext.txt'
try:
    # Remove the previous output before extracting, so a failed extraction
    # does not leave the earlier run's text behind.
    if os.path.exists(output_path):
        os.remove(output_path)
    outputString = readPDF(pdffile)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(outputString)
finally:
    # Close the HTTP response whether or not extraction/writing succeeded.
    pdffile.close()
# Alternative: print the extracted text to the console.
# outputString = readPDF(pdffile)
# print(outputString)
# pdffile.close()
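# [Web-page residue, not part of the script] This section may contain content unsuitable for display, so the page does not show it. You can review and modify it using the relevant editing features.
# [Web-page residue, not part of the script] If you confirm the content involves no inappropriate language / pure advertising / violence / vulgar or pornographic material / infringement / piracy / falsehood / worthless content, nor anything that violates national laws and regulations, you can click submit to appeal, and we will handle it as soon as possible.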