Multipage TIFF to a combined hOCR output
I am looking for an option to process a multipage TIFF image into a single hOCR file, i.e. one combined output for the whole multipage TIFF.
At present I am getting a separate hOCR file for each page.
from PIL import Image
import pytesseract as pt
import os
# Script: run Tesseract over every image in the input folder and write one
# hOCR file per frame (page) of each image into the output folder.
pt.pytesseract.tesseract_cmd = r'C:\Users\admin\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
# path for the folder for getting the raw images
path = "D:\\input"
# path for the folder for getting the output
tempPath = "D:\\output"
# iterating the images inside the folder
for imageName in os.listdir(path):
    # only images
    if not imageName.lower().endswith(('.tiff', '.jpg', '.png')):
        continue
    print(imageName)
    inputPath = os.path.join(path, imageName)
    # The context manager closes the image file even if OCR or writing fails.
    with Image.open(inputPath) as img:
        page = 0
        while True:
            # Keep the try narrow: only seek() is expected to signal "no more
            # frames", so an EOFError raised elsewhere is not swallowed.
            try:
                img.seek(page)
            except EOFError:
                # Not enough frames in img
                break
            text = pt.image_to_pdf_or_hocr(img, extension='hocr', config=(r'--oem 3 --psm 6'), lang="eng")
            # Printed index is 0-based; the file name below uses the
            # incremented (1-based) page number.
            print('page...', page)
            page += 1
            fullTempPath = os.path.join(tempPath, f"time_{imageName}_{page}.hocr")
            # saving the text for every page in a separate .hocr file
            with open(fullTempPath, "wb") as file1:
                file1.write(text)
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
