python pdf提取图片并提取图片中的文字.md · 1264644959/blog

提取图片

使用的是 pymupdf 库，导入时的模块名为 fitz

 pip install pymupdf -i http://pypi.douban.com/simple --trusted-host pypi.douban.com

开头安装程序所用到的文件

链接：https://pan.baidu.com/s/18d5X9O2BF8dm3gZI8p0J2w 
提取码：ob96

提取图片文字

感觉还是百度云识别得牛逼

调用百度智能云接口

每天 50000次够用

File:Tesseract.gif

tesseract图像识别

win安装使用教程

转载：

https://gitee.com/super__man/blog/blob/master/%E5%9B%BE%E4%B9%A6/python%20tesseract-ocr%20%E5%9B%BE%E6%96%87%E8%AF%86%E5%88%AB%20%EF%BC%88windows%E5%9F%BA%E7%A1%80%E7%8E%AF%E5%A2%83%E6%90%AD%E5%BB%BA%EF%BC%89%20-%20%E6%B8%85%E9%A3%8E%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95%20-%20%E5%8D%9A%E5%AE%A2%E5%9B%AD.pdf

问题1

pytesseract.pytesseract.TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.

报错原因是 pytesseract 没有找到 tesseract 程序，需要修改 pytesseract.py 中的程序路径

我使用 conda 管理虚拟环境，上面是我的包路径

进入 .py 文件

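将 pytesseract.py 中 tesseract_cmd 的值修改为你电脑上 tesseract.exe 的绝对路径（也可以不改库文件，直接在自己的脚本里设置 pytesseract.pytesseract.tesseract_cmd = r'你的安装目录\tesseract.exe'）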

再次运行就不会报错了

代码

import os
import time

import fitz
import glob

import pytesseract
import tesserocr
from PIL import Image
from aip import AipOcr

class PdfHandle:
    """Render PDF pages to PNG files and run OCR over the rendered images.

    Pages are rendered with PyMuPDF (``fitz``) into ``img_path``. Text is then
    recognised either through the Baidu ``AipOcr`` client (``tiqu`` and
    ``tiqu2``) or locally through ``pytesseract`` (``my_tesseract``).

    Credentials are never stored in the source. Each one is taken from the
    matching constructor argument or, when that is ``None``, from the
    environment variable ``BAIDU_OCR_APP_ID``, ``BAIDU_OCR_API_KEY`` or
    ``BAIDU_OCR_SECRET_KEY``; a missing value becomes an empty string.
    """

    def __init__(self, app_id=None, api_key=None, secret_key=None,
                 img_path='./pdf/'):
        """Store the OCR credentials and the directory holding page images.

        Args:
            app_id: Baidu application id, or ``None`` to read the environment.
            api_key: Baidu API key, or ``None`` to read the environment.
            secret_key: Baidu secret key, or ``None`` to read the environment.
            img_path: Directory that rendered PNG files are written to and
                read back from.
        """
        self.APP_ID = self._credential(app_id, 'BAIDU_OCR_APP_ID')
        self.API_KEY = self._credential(api_key, 'BAIDU_OCR_API_KEY')
        self.SECRET_KEY = self._credential(secret_key, 'BAIDU_OCR_SECRET_KEY')
        self.img_path = img_path
        # (path, file name) pairs filled in by traverse().
        self.imgs_list = []

    @staticmethod
    def _credential(value, env_name):
        """Return *value*, or the environment variable *env_name* if None."""
        if value is not None:
            return value
        return os.environ.get(env_name, '')

    @staticmethod
    def _compat(obj, *names):
        """Return the first attribute of *obj* that exists among *names*.

        PyMuPDF renamed its camelCase API to snake_case and later dropped the
        old aliases, so callers list the new name first and the old one as a
        fallback.
        """
        for name in names:
            attr = getattr(obj, name, None)
            if attr is not None:
                return attr
        raise AttributeError(
            '{!r} has none of the attributes {}'.format(obj, ', '.join(names)))

    @staticmethod
    def _page_order(name):
        """Sort key placing purely numeric file stems in numeric order."""
        stem = os.path.splitext(name)[0]
        if stem.isdecimal():
            return (0, int(stem), name)
        return (1, 0, name)

    @staticmethod
    def _warn(message):
        """Write *message* to stderr so stdout carries only recognised text."""
        import sys
        print(message, file=sys.stderr)

    def _collect_images(self):
        """Rebuild ``imgs_list`` from ``img_path`` and return it.

        The list is reset first so that repeated OCR runs do not accumulate
        duplicate entries.
        """
        self.imgs_list = []
        self.traverse(self.img_path)
        return self.imgs_list

    def _print_words(self, result):
        """Print every recognised line in *result*.

        A response without a ``words_result`` entry is reported on stderr
        instead of raising ``KeyError``.
        """
        words = result.get('words_result')
        if words is None:
            self._warn('OCR response has no words_result: {}'.format(result))
            return
        for temp in words:
            print(temp['words'])

    def fun1(self, filename, zoom=1000, rotate=0):
        """Render every page of a PDF to ``<img_path>/<page index>.png``.

        Args:
            filename: Glob pattern; the first matching file is opened.
            zoom: Scale factor in percent, applied to both axes.
            rotate: Rotation in degrees applied to the render matrix.

        Raises:
            FileNotFoundError: If *filename* matches no file.
        """
        matches = glob.glob(filename)
        if not matches:
            raise FileNotFoundError(
                'no file matches pattern {!r}'.format(filename))
        if self.img_path:
            os.makedirs(self.img_path, exist_ok=True)

        scale = zoom / 100.0
        doc = fitz.open(matches[0])
        try:
            for pg in range(len(doc)):
                page = doc[pg]
                matrix = fitz.Matrix(scale, scale)
                trans = self._compat(matrix, 'prerotate', 'preRotate')(rotate)
                render = self._compat(page, 'get_pixmap', 'getPixmap')
                pm = render(matrix=trans, alpha=0)
                target = os.path.join(self.img_path, '{}.png'.format(pg))
                self._compat(pm, 'save', 'writePNG')(target)
        finally:
            doc.close()

    def tiqu(self):
        """OCR the first collected image with ``basicAccurate`` and print it."""
        images = self._collect_images()
        aip_orc = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
        options = {
            'detect_direction': 'true',
            'probability': 'true',
        }

        # Only the first image is processed by this method.
        for file_path, _name in images[0:1]:
            result = aip_orc.basicAccurate(
                self.get_file_content(file_path), options)
            self._print_words(result)

    def tiqu2(self, max_retries=5, retry_delay=2):
        """OCR every collected image with ``basicGeneral`` and print the text.

        Args:
            max_retries: Maximum number of request attempts per image.
            retry_delay: Seconds to wait between failed attempts.

        An image whose requests all fail is reported on stderr and skipped,
        so one bad request can no longer stall the run forever.
        """
        images = self._collect_images()
        aip_orc = AipOcr(self.APP_ID, self.API_KEY, self.SECRET_KEY)
        options = {
            'language_type': 'CHN_ENG',
            'detect_direction': 'true',
            'detect_language': 'true',
            'probability': 'true',
        }

        for file_path, _name in images:
            print(file_path)
            # Read once, outside the retry loop: a local read error will not
            # be cured by retrying and should surface immediately.
            image = self.get_file_content(file_path)

            result = None
            for attempt in range(1, max_retries + 1):
                try:
                    result = aip_orc.basicGeneral(image, options)
                    break
                except Exception as e:
                    self._warn('OCR request failed ({}/{}): {}'.format(
                        attempt, max_retries, e))
                    if attempt < max_retries:
                        time.sleep(retry_delay)

            if result is None:
                self._warn('giving up on {}'.format(file_path))
            else:
                self._print_words(result)
            print('=========')

    def get_file_content(self, file_path):
        """Return the raw bytes of the file at *file_path*."""
        with open(file_path, 'rb') as f:
            return f.read()

    def traverse(self, d):
        """Recursively append every ``.png`` under *d* to ``imgs_list``.

        Entries are appended as ``(path, file name)`` tuples. Each directory
        listing is sorted so numbered page images are visited in page order
        rather than in the arbitrary order ``os.listdir`` returns.
        """
        for item in sorted(os.listdir(d), key=self._page_order):
            absPath = os.path.join(d, item)
            if os.path.isdir(absPath):
                self.traverse(absPath)
            elif item.endswith('.png'):
                self.imgs_list.append((absPath, item))

    def my_tesseract(self, image_path=r'D:\bfy\my_code\task_3\pdf\6.png',
                     lang='chi_sim'):
        """Print the text that Tesseract recognises in one image.

        Args:
            image_path: Image to recognise; defaults to the original sample.
            lang: Tesseract language code passed to ``image_to_string``.
        """
        with Image.open(image_path) as img:
            print(pytesseract.image_to_string(img, lang=lang))


if __name__ == '__main__':
    # Script entry point: run the Baidu OCR pass (tiqu2) over the page images.
    # Call my_tesseract() on the handler instead to use local Tesseract OCR.
    handler = PdfHandle()
    handler.tiqu2()

1264644959 / blog

提取图片

提取图片文字

调用百度智能云接口

tesseract图像识别

win安装使用教程

问题1

代码

简介

发行版

贡献者

近期动态

1264644959 / blog

提取图片

提取图片文字

调用百度智能云接口

tesseract图像识别

win安装使用教程

问题1

代码

简介

发行版

贡献者

近期动态

搜索帮助

1264644959 / blog