'double text recognition with opencv and pytesseract
I have these codes to recognize text from a pdf file but when returning text, some paragraphs are duplicated. Does anyone know why this can happen?
I know that I have duplicate line_items_coordinates but I don't know how I can solve it.
import cv2
from PIL import Image
def mark_region(image_path):
"""
It takes an image path as input, and returns a list of coordinates of the bounding boxes of the line
items
:param image_path: The path to the image you want to extract text from
"""
im = cv2.imread(image_path)
gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (9,9), 0)
thresh = cv2.adaptiveThreshold(blur,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV,11,30)
# Dilate to combine adjacent text contours
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9,9))
dilate = cv2.dilate(thresh, kernel, iterations=4)
# Find contours, highlight text areas, and extract ROIs
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
line_items_coordinates = []
for c in cnts:
area = cv2.contourArea(c)
x,y,w,h = cv2.boundingRect(c)
if y >= 300 and x <= 1000:
if area > 10000:
image = cv2.rectangle(im, (x,y), (2600, y+h), color=(255,0,255), thickness=3)
line_items_coordinates.append([(x,y), (2600, y+h)])
if y >= 2400 and x<= 2000:
image = cv2.rectangle(im, (x,y), (2600, y+h), color=(255,0,255), thickness=3)
line_items_coordinates.append([(x,y), (2600, y+h)])
watch_image(image) # Solo para ver que todo funcione correctamente
return image, line_items_coordinates, im
import pytesseract
import cv2
def get_text_from_region(image, c):
"""
- We take the image and the coordinates of the region we want to extract text from.
- We crop the image to the region we want to extract text from.
- We convert the image to black and white for better OCR.
- We use pytesseract to extract text from the image.
- We return the text
:param image: The image to be processed
:param c: the coordinates of the region of interest
"""
# cropping image img = image[y0:y1, x0:x1]
img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]
# convert the image to black and white for better OCR
ret,thresh1 = cv2.threshold(img,120,255,cv2.THRESH_BINARY)
# pytesseract image to string to get results
text = str(pytesseract.image_to_string(thresh1, config='--psm 6'))
return text
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|
