墨斋记-python ocr软件tesseract-ocr

exe下载地址：

UB-Mannheim/tesseract: Tesseract Open Source OCR Engine (main repository) — https://github.com/UB-Mannheim/tesseract

import fitz
import os
from PIL import Image
import pytesseract

def pdf():
    """Dump the text of every page of ./pdfs/focus1.pdf to ./pdfs/focus1.txt.

    Each page's text is echoed to stdout and appended to the output file,
    followed by a form-feed character that marks the page break. The output
    file is written as UTF-8.
    """
    # Context managers guarantee that both the PDF document and the output
    # file are closed even if extraction fails part-way through.
    with fitz.open('./pdfs/focus1.pdf') as doc, \
            open('./pdfs/focus1.txt', 'w', encoding='utf-8') as out:
        for page in doc:
            # get_text() already returns str; the file object performs the
            # UTF-8 encoding, so the text must not be encoded to bytes here
            # (writing bytes to a text-mode file raises TypeError).
            text = page.get_text()
            print(text)
            out.write(text)
            out.write('\f')  # form feed as the page separator
def OCR_demo():
    """Run Tesseract OCR over ./imgs/1.png through ./imgs/11.png.

    The recognized text of each image is written as UTF-8 to a sibling file
    with the same number and a .txt extension. A failure on one image is
    reported on stdout and processing continues with the next image.
    """
    for i in range(1, 12):
        # Build the paths outside the try block so the handler below can
        # always name the file that failed.
        image_path = f'./imgs/{i}.png'
        text_path = f'./imgs/{i}.txt'

        try:
            # The context manager releases the image's file handle as soon
            # as OCR is finished instead of leaving it to garbage collection.
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image)

            with open(text_path, 'w', encoding='utf-8') as out:
                out.write(text)
        except Exception as e:
            # Deliberately broad: this is a best-effort batch, so a missing
            # or unreadable image must not abort the remaining files.
            print(f"发生错误处理文件 {image_path}: {e}")

   
# Script entry point: run the OCR batch only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    OCR_demo()

墨斋记

Good Luck To You!

python ocr软件tesseract-ocr