# Tesseract installer (exe) download locations:
#   UB-Mannheim/tesseract: Tesseract Open Source OCR Engine (main repository) (github.com)
#   Index of /tesseract (uni-mannheim.de)
import fitz
import os
from PIL import Image
import pytesseract
def pdf():
    """Dump the text of ./pdfs/focus1.pdf into ./pdfs/focus1.txt.

    Each page's text is echoed to stdout (as UTF-8 bytes) and written to the
    output file, followed by a form feed character so page boundaries stay
    visible in the plain-text result.

    The output file is opened in text mode with an explicit UTF-8 encoding
    and is given ``str`` data; writing the encoded ``bytes`` to a text-mode
    file raises ``TypeError``.
    """
    doc = fitz.open('./pdfs/focus1.pdf')
    try:
        with open('./pdfs/focus1.txt', 'w', encoding='utf-8') as out:
            for page in doc:
                text = page.get_text()
                # Debug echo kept in its original form: the UTF-8 bytes repr.
                print(text.encode('utf-8'))
                out.write(text)
                out.write('\f')  # form feed marks the end of each page
    finally:
        # Release the PDF even if extraction or writing fails part-way.
        doc.close()
def OCR_demo(img_dir='./imgs/', start=1, stop=12):
    """Run Tesseract OCR over numbered PNG images and save the text beside them.

    For every ``i`` in ``range(start, stop)`` the image ``<img_dir>/<i>.png``
    is passed to ``pytesseract.image_to_string`` and the result is written to
    ``<img_dir>/<i>.txt`` as UTF-8. A failure on one image is reported on
    stdout and does not stop the remaining images from being processed.

    Args:
        img_dir: Directory containing the numbered images; the text files are
            written to the same directory.
        start: First image number to process (inclusive).
        stop: Image number to stop before (exclusive).
    """
    for i in range(start, stop):
        # Built outside the try so the error message can always name the file.
        image_path = os.path.join(img_dir, str(i) + '.png')
        text_path = os.path.join(img_dir, str(i) + '.txt')
        try:
            # The context manager releases the image's file handle after OCR.
            with Image.open(image_path) as image:
                text = pytesseract.image_to_string(image)
            with open(text_path, 'w', encoding='utf-8') as out:
                out.write(text)
        except Exception as e:
            # Deliberate best-effort: report the failing file and keep going.
            print(f"发生错误处理文件 {image_path}: {e}")
# Script entry point: run the OCR demo only when executed directly, not on import.
if __name__ == "__main__":
    OCR_demo()