Duplicated text recognition with OpenCV and pytesseract

I have this code to recognize text from a PDF file, but in the returned text some paragraphs are duplicated. Does anyone know why this happens?

I know that line_items_coordinates contains duplicate entries, but I don't know how to fix it.

import cv2
from PIL import Image

def mark_region(image_path):
    """
    Find text regions in an image, draw a rectangle around each one, and
    return the rectangles' coordinates.

    The image is converted to grayscale, blurred, adaptively thresholded and
    dilated so that neighbouring text merges into larger contours. A contour
    is kept when its bounding box satisfies at least one of these rules:

    - ``y >= 300`` and ``x <= 1000`` and contour area ``> 10000``
    - ``y >= 2400`` and ``x <= 2000``

    Each kept contour is recorded exactly once, even when it satisfies both
    rules (previously such a contour was appended twice, which made the
    downstream OCR return the same paragraph twice).

    :param image_path: The path to the image you want to extract text from
    :return: a tuple ``(image, line_items_coordinates, im)`` where ``image``
        is the annotated image, ``line_items_coordinates`` is a list of
        ``[(x, y), (2600, y + h)]`` boxes, and ``im`` is the array loaded
        from ``image_path``
    """

    im = cv2.imread(image_path)

    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # findContours returns 2 or 3 values depending on the OpenCV version;
    # the contour list is the first of 2 or the second of 3.
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Default to the loaded image so `image` is always bound, even when no
    # contour passes the filters below (this used to raise UnboundLocalError).
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, _w, h = cv2.boundingRect(c)

        in_main_band = y >= 300 and x <= 1000 and area > 10000
        in_lower_band = y >= 2400 and x <= 2000

        # One combined test: a contour matching both rules must produce a
        # single box, otherwise the same region is OCR'd twice.
        if in_main_band or in_lower_band:
            # Every box is stretched to x = 2600 regardless of contour width.
            # NOTE(review): side-by-side contours on the same rows can
            # therefore still yield overlapping boxes - confirm whether
            # those need merging as well.
            image = cv2.rectangle(
                im, (x, y), (2600, y + h), color=(255, 0, 255), thickness=3
            )
            line_items_coordinates.append([(x, y), (2600, y + h)])

    watch_image(image)  # Only to check that everything works correctly
    return image, line_items_coordinates, im

import pytesseract
import cv2

def get_text_from_region(image, c):
    """
    Run OCR on one rectangular region of an image and return the text.

    The region is cut out of ``image`` by slicing, a fixed binary threshold
    (cut-off 120, maximum 255) is applied to the crop, and the result is
    handed to pytesseract with the ``--psm 6`` configuration.

    :param image: The image to be processed (indexed as ``image[rows, cols]``)
    :param c: the coordinates of the region of interest, laid out as
        ``[(x0, y0), (x1, y1)]``
    :return: the text reported by pytesseract, as a ``str``
    """
    top_left, bottom_right = c[0], c[1]
    left, top = top_left[0], top_left[1]
    right, bottom = bottom_right[0], bottom_right[1]

    # Rows are selected by y and columns by x: image[y0:y1, x0:x1]
    region = image[top:bottom, left:right]

    # Binarize the crop before OCR; only the thresholded image is needed,
    # so the threshold value returned alongside it is dropped.
    binarized = cv2.threshold(region, 120, 255, cv2.THRESH_BINARY)[1]

    # Let pytesseract read the binarized crop
    return str(pytesseract.image_to_string(binarized, config='--psm 6'))

Sources

This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.

Source: Stack Overflow

Solution	Source

Duplicated text recognition with OpenCV and pytesseract

Sources

Related Questions