基于 PyTesseract 实现的 OCR 程序

前言

实习期间，应经理要求，将pdf文件内容中的指定文字、编号连同文件名称，填充进Excel表中。所提供pdf文件均为图片版，这就意味着不存在复制粘贴的可能，只能运用Quicker的OCR工具一个一个地手动识别内容，并将内容与文件名称粘贴至Excel表中。

然而，文件一共有三千多个……

手动复制是不可能的，这辈子都不可能的！

实现逻辑

工具 / 主要第三方库

Python 3.7.7
PyCharm 2021.2
PyMuPDF 1.18.15
Pillow 7.1.2
OpenCV-Python 4.5.3.56
PyTesseract 0.3.8
Pandas 1.0.3

实现思路

通过PyMuPDF库的fitz模块将图片版PDF转为图片格式；
通过PIL库实现对图片文件灰度化（因为我们需要用Tesseract进行文字识别，而Tesseract对彩色图片识别的支持不是很友好…）；
通过OpenCV对灰度图进行坐标定位，从而实现特定区域裁剪；
最后通过Tesseract对裁剪好的灰度图进行文字识别；
通过Pandas将所识别内容与文件名称一一对应，并填充进Excel中。

那么，思路具备，编码开始！

源码

在编写代码前最好先新建四个文件夹，分别用于存放PDF原文件、原图片、灰度图、所裁剪灰度图。以便在调试时及时定位bug，有效找出问题所在。思路清晰完善后再取消文件夹优化代码也不迟。

版本一

'''
OCR_Pro_1
此版本程序逻辑为，面对所有pdf文件分步执行程序流程。

邓小闲
'''

import os
import fitz #PDF转图片
from PIL import Image #图片灰度化
from matplotlib import pyplot as plt #绘制灰度图
import numpy as np
import cv2 #灰度图裁剪
import pytesseract #OCR
import re #去除所识别文字中冗余字符串
import pandas as pd
import time

# Script start time; the __main__ block subtracts it from the finish time to
# print the total run time.
start = time.time()

# PDF文件转图片
def pdf_to_img(file_pdf, file_img_ori):
    """Render the first page of every PDF in a folder to a PNG image.

    Args:
        file_pdf: Folder that holds the source PDF files.
        file_img_ori: Existing folder that receives one ``<name>.png`` per PDF.

    Returns:
        A dict ``{"文件名称": [...]}`` with the extension-less name of every
        converted PDF, in the order the files were processed.
    """
    print("正在将PDF文件转换成图片......")
    cons = []  # names of the converted files, without extension
    for pdf in os.listdir(file_pdf):
        # splitext copes with names that contain several dots or none at all.
        name, ext = os.path.splitext(pdf)
        # Skip stray entries (e.g. Thumbs.db, sub-folders) instead of handing
        # them to fitz.open.
        if ext.lower() != ".pdf":
            continue
        pdf_doc = fitz.open(os.path.join(file_pdf, pdf))
        try:
            # Only the first page is rendered.
            page = pdf_doc[0]
            # Scale both axes by 2 so the rendered image has more pixels.
            zoom = 2
            mat = fitz.Matrix(zoom, zoom)
            pix = page.getPixmap(matrix=mat, alpha=False)
            # The image keeps the name of the PDF it came from.
            pix.writePNG(os.path.join(file_img_ori, name + ".png"))
        finally:
            # Release the document even if rendering fails.
            pdf_doc.close()
        cons.append(name)
    return {"文件名称": cons}


# 图片灰度化
def img_gary(file_img_ori, file_img_gray):
    """Write a darkened grayscale copy of every image in a folder.

    Each image is converted to 8-bit grayscale, its pixel values are
    normalised to 0..1 and squared, and the result is drawn with matplotlib
    and saved as ``<name>.png`` at 400 dpi.

    Args:
        file_img_ori: Folder that holds the source images.
        file_img_gray: Existing folder that receives the grayscale images.
    """
    print("正在进行图片灰度化......")
    for img_ori in os.listdir(file_img_ori):
        # The context manager closes the file once the pixels are copied out.
        with Image.open(os.path.join(file_img_ori, img_ori)) as im:
            im_arr = np.array(im.convert("L"))
        # Squaring the normalised values makes darker pixels darker still.
        im_1 = 255.0 * (im_arr / 255.0) ** 2
        plt.axis("off")
        plt.imshow(Image.fromarray(im_1), cmap='gray')
        # The grayscale image keeps the name of its source image.
        name = os.path.splitext(img_ori)[0]
        # The default savefig resolution is too low, so set it explicitly.
        plt.savefig(os.path.join(file_img_gray, name + ".png"), dpi=400)
        # Close the figure; otherwise every image stays attached to the same
        # axes and memory use grows with each processed file.
        plt.close()


# 裁剪灰度图特定区域
def img_cropping(file_img_gray, file_img_gray_cropped,
                 region_1=(None, None, None, None),
                 region_2=(None, None, None, None)):
    """Cut two fixed regions out of every grayscale image in a folder.

    For each source image ``<name>.png`` two files are written:
    ``<name>_1.png`` (``region_1``) and ``<name>_2.png`` (``region_2``).

    Args:
        file_img_gray: Folder that holds the grayscale images.
        file_img_gray_cropped: Existing folder that receives the crops.
        region_1: ``(y_start, y_end, x_start, x_end)`` pixel bounds of the
            first region, i.e. the top-left and bottom-right corner of the
            crop. A ``None`` bound leaves that side open, so the default
            keeps the whole image.
        region_2: Same as ``region_1``, for the second region.

    Raises:
        ValueError: If an image cannot be decoded or a crop cannot be encoded.
    """
    print("正在进行特定区域裁剪......")
    regions = (("_1", region_1), ("_2", region_2))
    for img_gray in os.listdir(file_img_gray):
        source = os.path.join(file_img_gray, img_gray)
        # Read the bytes with numpy and decode them in memory.
        # NOTE(review): presumably done instead of cv2.imread so that paths
        # with non-ASCII characters can be opened - confirm.
        img = cv2.imdecode(np.fromfile(source, dtype=np.uint8), -1)
        if img is None:
            raise ValueError("cannot decode image: %s" % source)
        # Crops keep the name of their source image plus a region suffix.
        name = os.path.splitext(img_gray)[0]
        for suffix, (y_start, y_end, x_start, x_end) in regions:
            # The original author read these coordinates off Photoshop's
            # Info panel (Window -> Info).
            crop = img[y_start:y_end, x_start:x_end]
            target = os.path.join(file_img_gray_cropped, name + suffix + ".png")
            # Encode in memory and write the bytes with numpy, mirroring the
            # way the image was read above.
            ok, buf = cv2.imencode(".png", crop)
            if not ok:
                raise ValueError("cannot encode crop: %s" % target)
            buf.tofile(target)


# 文字识别，并将所识别文字添加进DataFrame中，同时创建Excel
def ocr(file_img_gray_cropped, dir_1, path_excel, excelName):
    """Recognise the text of the cropped images and write the results to Excel.

    The recognised text of every ``<name>_1.png`` / ``<name>_2.png`` pair is
    cleaned up, added to ``dir_1`` under the keys ``"所识别内容1"`` and
    ``"所识别内容2"``, and the resulting table is written to an Excel file.

    Args:
        file_img_gray_cropped: Folder that holds the cropped images.
        dir_1: Dict of columns collected so far. When it has a ``"文件名称"``
            entry, those names decide which crops are read and in which
            order, so every row pairs a file name with its own text. The
            dict is updated in place.
        path_excel: Folder in which the Excel file is created.
        excelName: File name of the Excel file, including its extension.
    """
    print("正在进行文字识别......")
    ocr_1 = []  # text recognised in region 1
    ocr_2 = []  # text recognised in region 2
    # Characters stripped from the recognised text (punctuation, spaces,
    # line feeds and form feeds).
    rstr = re.compile(r"[\=\(\)\,\/\\\:\*\?\"\<\>\|\' '\\\n\\\x0c]")

    # Walk the known file names rather than the directory listing: the
    # listing order of "<name>_1.png" files can differ from the order of the
    # "<name>.pdf" files (e.g. "a.pdf" < "a0.pdf" but "a0_1.png" < "a_1.png"),
    # which would pair file names with the wrong text.
    names = dir_1.get("文件名称")
    if names is None:
        # No name column supplied: fall back to the crops found on disk.
        suffix = "_1.png"
        names = [entry[:-len(suffix)]
                 for entry in os.listdir(file_img_gray_cropped)
                 if entry.endswith(suffix)]

    for name in names:
        # Region 1 needs the Simplified Chinese language pack, otherwise
        # Chinese text is not recognised.
        with Image.open(os.path.join(file_img_gray_cropped, name + "_1.png")) as crop_1:
            text_1 = pytesseract.image_to_string(crop_1, lang='chi_sim')
        ocr_1.append(rstr.sub("", text_1))
        # Region 2 is read with pytesseract's default language.
        with Image.open(os.path.join(file_img_gray_cropped, name + "_2.png")) as crop_2:
            text_2 = pytesseract.image_to_string(crop_2)
        ocr_2.append(rstr.sub("", text_2))

    dir_1.update({"所识别内容1": ocr_1, "所识别内容2": ocr_2})
    df = pd.DataFrame(dir_1)
    df.to_excel(os.path.join(path_excel, excelName))


if __name__ == '__main__':
    # Settings - fill these in before running the script.
    file_pdf = r""               # folder holding the source PDF files
    file_img_ori = r""           # folder for the page images
    file_img_gray = r""          # folder for the grayscale images
    file_img_gray_cropped = r""  # folder for the cropped grayscale images
    path_excel = r""             # folder in which the Excel file is created
    excelName = r""              # name of the Excel file, extension included

    # Line printed after every stage to separate the log output.
    divider = "=" * 130

    # Stage 1: PDF -> image; the returned dict carries the file names.
    dir_1 = pdf_to_img(file_pdf, file_img_ori)
    print("文件格式已转换成功")
    print(divider)

    # Stage 2: image -> grayscale image.
    img_gary(file_img_ori, file_img_gray)
    print("图片已灰度化成功")
    print(divider)

    # Stage 3: grayscale image -> cropped regions.
    img_cropping(file_img_gray, file_img_gray_cropped)
    print("灰度图已裁剪成功")
    print(divider)

    # Stage 4: cropped regions -> text -> Excel.
    ocr(file_img_gray_cropped, dir_1, path_excel, excelName)
    print("文字识别成功，并已添加进Excel中")
    print(divider)

    # Total run time, measured from the module-level start timestamp.
    end = time.time()
    print("程序运行时间为", end-start, "秒")

注

由于在我的工作内容中，所需信息都在PDF文件首页，故pdf_to_img()只需提取第一页内容，若想将pdf各页面全部提取，则代码为：

def pdf_to_img(file_pdf, file_img_ori):
    """Render every page of every PDF in a folder to a PNG image.

    Page ``i`` (counted from 0) of ``<name>.pdf`` is written to
    ``<name>-<i>.png``.

    Args:
        file_pdf: Folder that holds the source PDF files.
        file_img_ori: Existing folder that receives the page images.

    Returns:
        A dict ``{"文件名称": [...]}`` with the extension-less name of every
        page image written, in processing order. This matches the return
        shape of the first-page-only variant, whose result the caller stores
        and passes on to the OCR step.
    """
    cons = []  # names of the written page images, without extension
    for pdf in os.listdir(file_pdf):
        # splitext copes with names that contain several dots or none at all.
        name, ext = os.path.splitext(pdf)
        # Skip stray entries (e.g. Thumbs.db, sub-folders) instead of handing
        # them to fitz.open.
        if ext.lower() != ".pdf":
            continue
        pdf_doc = fitz.open(os.path.join(file_pdf, pdf))
        try:
            # Scale both axes by 2 so the rendered images have more pixels.
            zoom = 2
            mat = fitz.Matrix(zoom, zoom)
            for pg in range(pdf_doc.pageCount):
                page = pdf_doc[pg]
                pix = page.getPixmap(matrix=mat, alpha=False)
                page_name = "%s-%i" % (name, pg)
                pix.writePNG(os.path.join(file_img_ori, page_name + ".png"))
                cons.append(page_name)
        finally:
            # Release the document even if rendering fails.
            pdf_doc.close()
    return {"文件名称": cons}

版本二

'''
OCR_Pro_2
此版本程序逻辑为，对每个pdf文件轮流执行全部流程。

邓小闲
'''

import os
import fitz
from PIL import Image
from matplotlib import pyplot as plt
import numpy as np
import cv2
import pytesseract
import pandas as pd
import re
import time

# Script start time; the __main__ block subtracts it from the finish time to
# print the total run time.
start = time.time()

def ocr(file_pdf, file_img_ori, file_img_gray, file_img_gray_cropped, path_excel, excelName,
        region_1=(None, None, None, None), region_2=(None, None, None, None)):
    """Run the whole pipeline on one PDF after another and write an Excel file.

    For every PDF the first page is rendered to an image, converted to
    grayscale, cropped to two regions and passed to Tesseract. The file names
    and the cleaned-up text of both regions end up in the columns
    ``"文件名称"``, ``"所识别内容1"`` and ``"所识别内容2"``.

    Args:
        file_pdf: Folder that holds the source PDF files.
        file_img_ori: Existing folder for the page images.
        file_img_gray: Existing folder for the grayscale images.
        file_img_gray_cropped: Existing folder for the cropped images.
        path_excel: Folder in which the Excel file is created.
        excelName: File name of the Excel file, including its extension.
        region_1: ``(y_start, y_end, x_start, x_end)`` pixel bounds of the
            first region. A ``None`` bound leaves that side open, so the
            default keeps the whole image.
        region_2: Same as ``region_1``, for the second region.

    Raises:
        ValueError: If a grayscale image cannot be decoded or a crop cannot
            be encoded.
    """
    cons = []   # file names, without extension
    ocr_1 = []  # text recognised in region 1
    ocr_2 = []  # text recognised in region 2
    # Characters stripped from the recognised text (punctuation, spaces,
    # line feeds and form feeds); compiled once for all files.
    rstr = re.compile(r"[\=\(\)\,\/\\\:\*\?\"\<\>\|\' '\\\n\\\x0c]")
    regions = (("_1", region_1), ("_2", region_2))

    # Ignore stray entries (e.g. Thumbs.db, sub-folders) in the PDF folder.
    pdfs = [entry for entry in os.listdir(file_pdf)
            if os.path.splitext(entry)[1].lower() == ".pdf"]
    for ser, pdf in enumerate(pdfs, start=1):
        Name = os.path.splitext(pdf)[0]

        # PDF -> image: only the first page is rendered, scaled by 2.
        img_ori = os.path.join(file_img_ori, Name + ".png")
        pdf_doc = fitz.open(os.path.join(file_pdf, pdf))
        try:
            page = pdf_doc[0]
            zoom = 2
            mat = fitz.Matrix(zoom, zoom)
            pix = page.getPixmap(matrix=mat, alpha=False)
            pix.writePNG(img_ori)
        finally:
            # Release the document even if rendering fails.
            pdf_doc.close()
        print("第%s个PDF文件 ————"%ser, pdf, "已完成文件格式转换")

        # Image -> grayscale image.
        with Image.open(img_ori) as im:
            im_arr = np.array(im.convert("L"))
        # NOTE(review): unlike img_gary in the step-by-step variant, the
        # normalised values are not squared here - confirm this is intended.
        im_1 = 255.0 * (im_arr / 255.0)
        plt.axis("off")
        plt.imshow(Image.fromarray(im_1), cmap='gray')
        img_gray = os.path.join(file_img_gray, Name + ".png")
        plt.savefig(img_gray, dpi=400)
        # Close the figure; otherwise every image stays attached to the same
        # axes and memory use grows with each processed file.
        plt.close()
        print("第%s个PDF文件 ————"%ser, pdf, "已完成图片灰度化")

        # Grayscale image -> the two cropped regions.
        img = cv2.imdecode(np.fromfile(img_gray, dtype=np.uint8), -1)
        if img is None:
            raise ValueError("cannot decode image: %s" % img_gray)
        cropped_paths = []
        for suffix, (y_start, y_end, x_start, x_end) in regions:
            crop = img[y_start:y_end, x_start:x_end]
            target = os.path.join(file_img_gray_cropped, Name + suffix + ".png")
            # Encode in memory and write the bytes with numpy, mirroring the
            # way the image was read above.
            ok, buf = cv2.imencode(".png", crop)
            if not ok:
                raise ValueError("cannot encode crop: %s" % target)
            buf.tofile(target)
            cropped_paths.append(target)
        img_cropped_1, img_cropped_2 = cropped_paths
        print("第%s个PDF文件 ————"%ser, pdf, "已完成指定区域裁剪")

        # Cropped regions -> text. Region 1 uses the Simplified Chinese
        # language pack, region 2 pytesseract's default language.
        with Image.open(img_cropped_1) as crop_1:
            text_1 = pytesseract.image_to_string(crop_1, lang='chi_sim')
        with Image.open(img_cropped_2) as crop_2:
            text_2 = pytesseract.image_to_string(crop_2)
        ocr_1.append(rstr.sub("", text_1))
        ocr_2.append(rstr.sub("", text_2))

        # Record the file name in the same row as its text.
        cons.append(Name)
        print("第%s个PDF文件 ————"%ser, pdf, "已完成文字识别，并成功添加进字典")
        print("="*130)

    columns = {"文件名称": cons, "所识别内容1":ocr_1, "所识别内容2":ocr_2}
    df = pd.DataFrame(columns)
    df.to_excel(os.path.join(path_excel, excelName))


if __name__ == '__main__':
    # Settings - fill these in before running the script.
    file_pdf = r""               # folder holding the source PDF files
    file_img_ori = r""           # folder for the page images
    file_img_gray = r""          # folder for the grayscale images
    file_img_gray_cropped = r""  # folder for the cropped grayscale images
    path_excel = r""             # folder in which the Excel file is created
    excelName = r""              # name of the Excel file, extension included

    # Process every PDF from conversion through to the Excel export.
    ocr(
        file_pdf,
        file_img_ori,
        file_img_gray,
        file_img_gray_cropped,
        path_excel,
        excelName,
    )

    # Total run time, measured from the module-level start timestamp.
    end = time.time()
    print("程序运行时间为", end-start, "秒")