'Multipage tiff to a combined hocr output

I am looking for the option to process multipage tiff image to single HOCR file. A combined output of multipage tiff.

At present I am getting HOCR as different hocr pages.

from PIL import Image
import pytesseract as pt
import os

pt.pytesseract.tesseract_cmd = r'C:\Users\admin\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
     
# path for the folder for getting the raw images
path = "D:\\input"

# path for the folder for getting the output
tempPath = "D:\\output"

# iterating the images inside the folder
for imageName in os.listdir(path):
 
    # only images   
    if imageName.lower().endswith(('.tiff', '.jpg', '.png')):
        print(imageName)
        
        inputPath = os.path.join(path, imageName)
        img = Image.open(inputPath)
    
        page = 0
        while True:
            try:
        
                img.seek(page)
                text = pt.image_to_pdf_or_hocr(img, extension='hocr', config=(r'--oem 3 --psm 6'), lang="eng")
        
                print('page...', page)
                page += 1
         
                fullTempPath = os.path.join(tempPath, f"time_{imageName}_{page}.hocr")
                #print(text)
        
                # saving the text for every image in a separate .hocr file
                file1 = open(fullTempPath, "wb")
                file1.write(text)
                file1.close()
            except EOFError:
                # Not enough frames in img
                break

python ocr

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source

'Multipage tiff to a combined hocr output

Sources

Related Questions