'''
OCR_Pro_2

In this version the program runs the complete pipeline on each PDF file in turn.

邓小闲
'''
import os
import re
import time

import cv2
import fitz
import numpy as np
import pandas as pd
import pytesseract
from matplotlib import pyplot as plt
from PIL import Image
# Wall-clock time at start-up; the end-of-run report subtracts this from the
# finishing time to print the total run time in seconds.
start = time.time()
# Characters removed from every OCR result before it is stored: assorted
# punctuation, quotes, spaces, backslashes, newlines and form feeds (\x0c).
# Compiled once here instead of on every loop iteration.
_OCR_NOISE = re.compile(r"[\=\(\)\,\/\\\:\*\?\"\<\>\|\' '\\\n\\\x0c]")


def _render_first_page(pdf_path, png_path, zoom=2):
    """Render the first page of *pdf_path* to *png_path*, magnified *zoom* times."""
    pdf_doc = fitz.open(pdf_path)
    try:
        page = pdf_doc[0]
        mat = fitz.Matrix(zoom, zoom)
        # Newer PyMuPDF releases only provide the snake_case names; older ones
        # only the camelCase names. Prefer the new name, fall back to the old.
        render = getattr(page, "get_pixmap", None) or page.getPixmap
        pix = render(matrix=mat, alpha=False)
        save = getattr(pix, "save", None) or pix.writePNG
        save(png_path)
    finally:
        # The original never closed the document, leaking one handle per PDF.
        pdf_doc.close()


def _save_grayscale(src_png, dst_png):
    """Plot *src_png* in grayscale with matplotlib and save the figure to *dst_png*.

    The saved picture is the matplotlib figure (axes hidden, 400 dpi), not a
    pixel-for-pixel copy of the input, so crop coordinates used afterwards
    refer to this figure image.
    """
    with Image.open(src_png) as im:
        gray = np.array(im.convert("L"))
    # A fresh figure per file: the implicit pyplot figure was never cleared, so
    # every earlier page stayed drawn (and in memory) underneath the new one.
    fig, ax = plt.subplots()
    try:
        ax.axis("off")
        ax.imshow(gray, cmap="gray")
        fig.savefig(dst_png, dpi=400)
    finally:
        plt.close(fig)


def _crop(image, region):
    """Return ``image[y_start:y_end, x_start:x_end]``, or *image* itself if *region* is None."""
    if region is None:
        return image
    y_start, y_end, x_start, x_end = region
    return image[y_start:y_end, x_start:x_end]


def _write_png(path, image):
    """Write *image* to *path* as PNG via an in-memory encode.

    The image is read back with ``np.fromfile`` + ``cv2.imdecode`` so that
    non-ASCII paths work; writing uses the matching encode-then-``tofile``
    route instead of ``cv2.imwrite`` for the same reason.
    """
    ok, encoded = cv2.imencode(".png", image)
    if not ok:
        raise OSError("could not encode image for %s" % path)
    encoded.tofile(path)


def _recognise(image_path, **tesseract_options):
    """OCR *image_path* with Tesseract and strip the unwanted characters."""
    with Image.open(image_path) as im:
        text = pytesseract.image_to_string(im, **tesseract_options)
    return _OCR_NOISE.sub("", text)


def ocr(file_pdf, file_img_ori, file_img_gray, file_img_gray_cropped,
        path_excel, excelName, region_1=None, region_2=None):
    """Run the whole OCR pipeline on every PDF in a folder and export the results.

    For each ``.pdf`` file (other directory entries are skipped) the first
    page is rendered to PNG, re-saved as a grayscale matplotlib figure, cropped
    to two regions, and each crop is passed to Tesseract. The cleaned-up texts
    are collected and written to one Excel workbook.

    Args:
        file_pdf: Folder containing the input PDF files.
        file_img_ori: Folder that receives the rendered first-page PNGs.
        file_img_gray: Folder that receives the grayscale PNGs.
        file_img_gray_cropped: Folder that receives the two cropped PNGs
            (``<name>_1.png`` and ``<name>_2.png``).
        path_excel: Folder that receives the Excel workbook.
        excelName: File name of the workbook, including its extension.
        region_1: ``(y_start, y_end, x_start, x_end)`` pixel bounds, in the
            grayscale image, of the area recognised with ``lang='chi_sim'``.
            ``None`` (default) uses the whole image.
        region_2: Same as *region_1*, for the area recognised with
            Tesseract's default language.

    Returns:
        None. Results are written to ``path_excel/excelName``.
    """
    cons = []
    ocr_1 = []
    ocr_2 = []

    pdfs = [entry for entry in os.listdir(file_pdf)
            if entry.lower().endswith(".pdf")]

    for ser, pdf in enumerate(pdfs, start=1):
        Name = os.path.splitext(pdf)[0]
        progress = "第%s个PDF文件 ————" % ser

        # 1. PDF -> PNG (first page only, 2x zoom for a sharper raster).
        img_ori = os.path.join(file_img_ori, Name + ".png")
        _render_first_page(os.path.join(file_pdf, pdf), img_ori, zoom=2)
        print(progress, pdf, "已完成文件格式转换")

        # 2. PNG -> grayscale figure.
        img_gray = os.path.join(file_img_gray, Name + ".png")
        _save_grayscale(img_ori, img_gray)
        print(progress, pdf, "已完成图片灰度化")

        # 3. Crop the two regions of interest.
        img = cv2.imdecode(np.fromfile(img_gray, dtype=np.uint8),
                           cv2.IMREAD_UNCHANGED)
        img_cropped_1 = os.path.join(file_img_gray_cropped, Name + "_1" + ".png")
        _write_png(img_cropped_1, _crop(img, region_1))
        img_cropped_2 = os.path.join(file_img_gray_cropped, Name + "_2" + ".png")
        _write_png(img_cropped_2, _crop(img, region_2))
        print(progress, pdf, "已完成指定区域裁剪")

        # 4. OCR both crops: simplified Chinese for the first, default
        #    language for the second.
        ocr_1.append(_recognise(img_cropped_1, lang='chi_sim'))
        ocr_2.append(_recognise(img_cropped_2))

        # Record the file name alongside its recognised texts.
        cons.append(Name)
        print(progress, pdf, "已完成文字识别,并成功添加进字典")
        print("=" * 130)

    columns = {"文件名称": cons, "所识别内容1": ocr_1, "所识别内容2": ocr_2}
    df = pd.DataFrame(columns)
    df.to_excel(os.path.join(path_excel, excelName))
if __name__ == '__main__':
    # Placeholders to fill in before running: the input PDF folder, the three
    # working folders for intermediate images, and the Excel output location.
    file_pdf = r""
    file_img_ori = r""
    file_img_gray = r""
    file_img_gray_cropped = r""
    path_excel = r""
    excelName = r""

    ocr(
        file_pdf,
        file_img_ori,
        file_img_gray,
        file_img_gray_cropped,
        path_excel,
        excelName,
    )

    # Report the total run time, measured from the module-level `start`.
    end = time.time()
    elapsed = end - start
    print("程序运行时间为", elapsed, "秒")