本文基于上一篇文章:对图片进行 OCR 文字识别后,将识别结果另存为 Word 文档。
Python源码:
from docx import Document
import pytesseract
import cv2
# Point pytesseract at the Tesseract executable (Tesseract OCR must be
# installed and this path must match the local installation).
pytesseract.pytesseract.tesseract_cmd = r'D:\img_ocr\tesseract.exe'
def extract_text_from_image_and_save_to_word(image_path, output_word):
    """
    Extract the text in an image with OCR and save it to a Word document.

    The image is converted to grayscale, binarized with an inverted fixed
    threshold of 150, and passed to Tesseract with ``--psm 6``. The
    recognized text is written as a single paragraph of a new document.

    :param image_path: path of the input image
    :param output_word: path of the Word (.docx) file to write
    :raises FileNotFoundError: if the image cannot be read from image_path
    """
    # cv2.imread signals failure by returning None instead of raising, so
    # check here rather than letting cvtColor fail with an opaque error.
    img = cv2.imread(image_path)
    if img is None:
        raise FileNotFoundError(f"Cannot read image: {image_path!r}")

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarize (inverted) to raise contrast before OCR
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)

    # Run Tesseract on the binarized image
    extracted_text = pytesseract.image_to_string(binary, config='--psm 6')

    # OCR output may carry control characters (e.g. a trailing form feed)
    # that are not valid in XML and can make the document writer reject the
    # text; drop them while keeping tab, newline and carriage return.
    cleaned_text = "".join(
        ch for ch in extracted_text if ch in "\t\n\r" or ord(ch) >= 0x20
    )

    # Build the document and store the recognized text as one paragraph
    doc = Document()
    doc.add_paragraph(cleaned_text)

    # Save to the requested path
    doc.save(output_word)
    print(f"文本已成功导出到 {output_word}")
# Example invocation: OCR a sample image and write the text to a Word file.
output_word = 'output2.docx'  # path of the Word file to create
image_path = '123.png'  # replace with the path to your own image
extract_text_from_image_and_save_to_word(
    image_path=image_path,
    output_word=output_word,
)