Pipeline overview:
1. Perspective transform to straighten the book page.
2. Adaptive threshold to highlight the text.
3. Detect text with Tesseract OCR.
4. Reorder the detected sentences into a paragraph.
5. Read the paragraph aloud with text-to-speech.
# book_reader.py
#
# Read a photographed book page aloud: straighten the page with a
# perspective transform, binarize it, OCR it with Tesseract, rebuild the
# paragraph, and speak it with gTTS.
#
# FIX: the original first line was "#book_reader.pyimport cv2" -- the
# filename comment had fused with the import, commenting out cv2.
import os
import subprocess

import cv2
import imutils
import numpy as np
import pytesseract
from gtts import gTTS
from playsound import playsound

# Windows install location of the Tesseract binary -- adjust per machine.
pytesseract.pytesseract.tesseract_cmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"
def speak(audioString):
    """Synthesize *audioString* with gTTS and play the resulting MP3.

    Side effects: writes assets/audio.mp3, launches Windows Media Player
    (subprocess.call blocks until the player exits), then plays the file
    again with playsound.
    """
    print(audioString)
    tts = gTTS(text=audioString, lang='en')
    tts.save("assets/audio.mp3")
    # Windows Media Player path is hard-coded -- adjust per machine.
    wmp = r"C:\Program Files (x86)\Windows Media Player\wmplayer.exe"
    media_file = os.path.abspath(os.path.relpath("assets/audio.mp3"))
    subprocess.call([wmp, media_file])
    # BUG FIX: the original passed "audio.mp3" here, but the file was
    # saved to assets/audio.mp3, so playsound could never find it.
    playsound(media_file)
# Load the page photo and scale it to a fixed working width of 1500 px,
# preserving aspect ratio, so later pixel thresholds (e.g. the 300 px
# column gap) behave consistently across input sizes.
img = cv2.imread("assets/ikea.jpg")
h, w, c = img.shape
relative_w = 1500
relative_h = int(relative_w / w * h)
img = cv2.resize(img, (relative_w, relative_h))
img_copy = img.copy()  # untouched copy used only for the debug overlay
# Grayscale + blur suppress paper texture so Canny responds to page edges.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
gray = cv2.GaussianBlur(gray, (7, 7), 0)
# perform edge detection, then perform a dilation + erosion to
# close gaps in between object edges
edged = cv2.Canny(gray, 100, 200)
edged = cv2.dilate(edged, None, iterations=1)
edged = cv2.erode(edged, None, iterations=1)
# find contours in the edge map (outer contours only; the page outline
# is expected to be among them)
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
# Find the contour with the biggest area -- assumed to be the book page.
# BUG FIX: the original reset `largest = 0` *inside* the loop, so every
# iteration with a non-zero area passed the test and the code kept the
# LAST such contour, not the true maximum.
largest = 0
largest_contour = None
for c in cnts:
    area = cv2.contourArea(c)
    if area > largest:
        largest = area
        largest_contour = c
# Fit a rotated rectangle around the page contour; boxPoints yields its
# four corner points (cv2.cv.BoxPoints on legacy OpenCV 2).
box = cv2.minAreaRect(largest_contour)
box = cv2.cv.BoxPoints(box) if imutils.is_cv2() else cv2.boxPoints(box)
box = np.array(box, dtype="int")
# Draw the detected page outline on the debug copy.
cv2.drawContours(img_copy, [box.astype("int")], -1, (0, 255, 0), 2)
# Warp the detected corners onto the full frame. The destination order
# (bottom-left, top-left, top-right, bottom-right) assumes boxPoints
# returns corners starting bottom-left going clockwise -- NOTE(review):
# boxPoints corner order depends on the rectangle's angle; confirm this
# holds for other photos.
box_frame = np.float32(box)
img_frame = np.float32([[0, relative_h], [0, 0], [relative_w, 0], [relative_w, relative_h]])
matrix = cv2.getPerspectiveTransform(box_frame, img_frame)
straight_img = cv2.warpPerspective(img, matrix, (relative_w, relative_h))
# Binarize the straightened page: a mean adaptive threshold highlights
# the text, and a median blur removes salt-and-pepper threshold noise.
# CLEANUP: removed `adaptive_threshold_inv` and `hImg`/`wImg`, which
# were computed but never used anywhere in the script.
gray2 = cv2.cvtColor(straight_img, cv2.COLOR_BGR2GRAY)
adaptive_threshold = cv2.adaptiveThreshold(gray2, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
median_blur = cv2.medianBlur(adaptive_threshold, 3)
# Tesseract expects RGB; image_to_data returns one TSV row per detected
# element (word rows carry 12 fields, with the text in the last one).
img_rgb = cv2.cvtColor(median_blur, cv2.COLOR_GRAY2RGB)
boxes = pytesseract.image_to_data(img_rgb)
print(boxes)
# Record sentences from detected texts: group consecutive words into
# "blocks" (text columns). A jump of more than 300 px in the word's
# left-x coordinate starts a new block.
blocks = []
previous_x = 0
block = []
for i, b in enumerate(boxes.splitlines()):
    if i != 0:  # skip the TSV header row
        b = b.split()
        if len(b) == 12:  # only 12-field rows carry an actual word
            x, y, w, h = (int(b[6]), int(b[7]), int(b[8]), int(b[9]))
            # Debug overlay: outline each word and draw its text.
            cv2.rectangle(straight_img, (x, y), (w + x, h + y), (0, 0, 255), 1)
            cv2.putText(straight_img, b[11], (x, y), cv2.FONT_HERSHEY_COMPLEX, 0.5, (50, 50, 255), 2)
            if abs(x - previous_x) > 300:
                # BUG FIX: guard against appending the empty initial
                # block when the very first word starts beyond x=300.
                if block:
                    blocks.append({'block': block, 'x': previous_x})
                block = []
            block.append(b[11])
            previous_x = x
blocks.append({'block': block, 'x': previous_x})
print(blocks)
# Reorder sentence blocks so entries that share a column (left-x within
# 300 px of each other) become adjacent: for each position, pull the
# first matching later block up to the slot right after it.
total = len(blocks)
for anchor in range(total):
    for scan in range(anchor + 1, total):
        if abs(blocks[anchor]['x'] - blocks[scan]['x']) < 300:
            blocks.insert(anchor + 1, blocks.pop(scan))
            break
print(blocks)
# Reorder sentences into a paragraph. A block in the same column as the
# previous one (left-x within 300 px) ends its line with a newline; a
# column change continues the line with a space.
pieces = []
previous_x = 0
for entry in blocks:
    sep = "\n" if abs(entry['x'] - previous_x) < 300 else " "
    pieces.append(" ".join(entry['block']) + sep)
    previous_x = entry['x']
paragraph = "".join(pieces)
print(paragraph)
# Show the debug views (original photo with the detected page box, the
# straightened page with word overlays, the binarized image), wait for a
# keypress, then read the reconstructed paragraph aloud.
cv2.imshow("picture", img_copy)
#cv2.imshow("canny edge", edged)
cv2.imshow("straight image", straight_img)
cv2.imshow("median blur", median_blur)
cv2.waitKey(0)
speak(paragraph)
------------------------------------
#logs
#detected words table
level page_num block_num par_num line_num word_num left top width height conf text
1 1 0 0 0 0 0 0 1500 1125 -1
2 1 1 0 0 0 180 997 576 24 -1
3 1 1 1 0 0 180 997 576 24 -1
4 1 1 1 1 0 180 997 576 24 -1
5 1 1 1 1 1 180 1009 28 12 85 SEE
5 1 1 1 1 2 215 1008 34 12 85 OUR
5 1 1 1 1 3 257 993 77 35 95 BIRTHDAY
5 1 1 1 1 4 345 993 59 35 91 OFFERS
5 1 1 1 1 5 424 1013 2 3 7 "
5 1 1 1 1 6 730 1000 26 11 5 ot
2 1 2 0 0 0 499 1006 433 52 -1
3 1 2 1 0 0 499 1006 433 52 -1
4 1 2 1 1 0 499 1006 433 29 -1
5 1 2 1 1 1 499 1012 40 23 30 THE
5 1 2 1 1 2 546 1012 62 20 96 PRICES
5 1 2 1 1 3 614 1019 21 12 91 IN
5 1 2 1 1 4 641 1018 42 12 91 THIS
5 1 2 1 1 5 690 1017 104 13 92 CATALOGUE
5 1 2 1 1 6 801 1016 36 12 92 CAN
5 1 2 1 1 7 843 1006 49 22 96 ONLY
5 1 2 1 1 8 898 1006 34 22 96 GET
4 1 2 1 2 0 506 1036 395 22 -1
5 1 2 1 2 1 506 1040 61 12 95 LOWER
5 1 2 1 2 2 574 1027 51 35 73 UNTIL
5 1 2 1 2 3 633 1027 39 35 73 JULY
5 1 2 1 2 4 681 1037 20 13 79 31
5 1 2 1 2 5 709 1037 49 15 79 2019,
5 1 2 1 2 6 765 1037 59 12 96 NEVER
5 1 2 1 2 7 830 1036 71 12 95 HIGHER
2 1 3 0 0 0 170 1022 204 47 -1
3 1 3 1 0 0 174 1022 200 43 -1
4 1 3 1 1 0 176 1022 190 20 -1
5 1 3 1 1 1 176 1028 19 12 71 AT
5 1 3 1 1 2 206 1015 28 34 71 THE
5 1 3 1 1 3 240 1025 69 15 93 BACK
5 1 3 1 1 4 291 1015 17 34 93 OF
5 1 3 1 1 5 316 1022 50 20 15 THE.
4 1 3 1 2 0 170 1041 204 28 -1
5 1 3 1 2 1 170 1041 100 28 94 CATALOGUE
5 1 3 1 2 2 349 1042 25 3 17 mo
2 1 4 0 0 0 0 0 1500 1125 -1
3 1 4 1 0 0 0 0 1500 1125 -1
4 1 4 1 1 0 0 0 1500 1125 -1
5 1 4 1 1 1 0 0 1500 1125 95
#sentence
[{'block': ['SEE', 'OUR', 'BIRTHDAY', 'OFFERS', '"'], 'x': 424}, {'block': ['ot', 'THE', 'PRICES', 'IN', 'THIS', 'CATALOGUE', 'CAN', 'ONLY', 'GET'], 'x': 898}, {'bloc
k': ['LOWER', 'UNTIL', 'JULY', '31', '2019,', 'NEVER', 'HIGHER'], 'x': 830}, {'block': ['AT', 'THE', 'BACK', 'OF', 'THE.', 'CATALOGUE', 'mo'], 'x': 349}]
#ordered sentence
[{'block': ['SEE', 'OUR', 'BIRTHDAY', 'OFFERS', '"'], 'x': 424}, {'block': ['AT', 'THE', 'BACK', 'OF', 'THE.', 'CATALOGUE', 'mo'], 'x': 349}, {'block': ['ot', 'THE',
'PRICES', 'IN', 'THIS', 'CATALOGUE', 'CAN', 'ONLY', 'GET'], 'x': 898}, {'block': ['LOWER', 'UNTIL', 'JULY', '31', '2019,', 'NEVER', 'HIGHER'], 'x': 830}]
#paragraph
SEE OUR BIRTHDAY OFFERS " AT THE BACK OF THE. CATALOGUE mo
ot THE PRICES IN THIS CATALOGUE CAN ONLY GET LOWER UNTIL JULY 31 2019, NEVER HIGHER
reference:
perspective transform
adaptive threshold
text to speech
No comments:
Post a Comment