Chuanshuoge: opencv 32 reading book

perspective transform to straighten the book

adaptive threshold to highlight text

detect texts with tesseract

reorder sentences into paragraph

read paragraph with text to speech

#book_reader.py

import cv2

import numpy as np

import imutils

import pytesseract

#import pyttsx3

from gtts import gTTS

from playsound import playsound

import subprocess

import os

# Point pytesseract at the Tesseract executable (hard-coded Windows install
# path; adjust it if Tesseract is installed elsewhere).
pytesseract.pytesseract.tesseract_cmd = "C:\\Program Files\\Tesseract-OCR\\tesseract.exe"

def speak(audioString):
    """Print *audioString*, synthesize it to speech with gTTS, and play it.

    The speech is saved as ``assets/audio.mp3`` (the directory is created if
    needed) and that same file is played back. Windows Media Player is used
    when its executable exists at the hard-coded path; otherwise playback
    falls back to ``playsound``.

    Args:
        audioString: English text to read aloud. Empty or whitespace-only
            text is printed but not synthesized.

    Returns:
        None.
    """
    print(audioString)

    # Nothing to say: skip synthesis instead of letting gTTS fail on
    # empty input.
    if not audioString or not audioString.strip():
        return

    media_file = os.path.abspath(os.path.join("assets", "audio.mp3"))
    os.makedirs(os.path.dirname(media_file), exist_ok=True)
    gTTS(text=audioString, lang='en').save(media_file)

    wmp = r"C:\Program Files (x86)\Windows Media Player\wmplayer.exe"
    if os.path.isfile(wmp):
        # Blocks until the player window is closed.
        subprocess.call([wmp, media_file])
    else:
        # Play the file that was actually written (the original played
        # "audio.mp3", which is not where the speech was saved).
        playsound(media_file)

#engine = pyttsx3.init()

#engine.say(audioString)

#engine.runAndWait()

# Load the photographed page.
img = cv2.imread("assets/ikea.jpg")

# cv2.imread signals a missing/unreadable file by returning None rather than
# raising, so fail here with a clear message instead of on `.shape` below.
if img is None:
    raise FileNotFoundError("could not read image: assets/ikea.jpg")

h, w = img.shape[:2]

# Rescale to a fixed width of 1500 px, keeping the aspect ratio.
relative_w = 1500
relative_h = int(relative_w / w * h)
img = cv2.resize(img, (relative_w, relative_h))

# Separate copy used for drawing the detected outline.
img_copy = img.copy()

# Grayscale + 7x7 Gaussian blur before edge detection.
gray = cv2.GaussianBlur(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY), (7, 7), 0)

# Canny edges, then one dilation followed by one erosion to close gaps
# in between object edges.
edged = cv2.Canny(gray, 100, 200)
for morph in (cv2.dilate, cv2.erode):
    edged = morph(edged, None, iterations=1)

# External contours of the edge map; grab_contours hides the differing
# return shapes of findContours across OpenCV versions.
cnts = imutils.grab_contours(
    cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
)

# find contour with biggest area
# (The previous loop reset its running maximum on every iteration, so it
# ended up keeping the last contour with non-zero area, not the largest.)
if len(cnts) == 0:
    raise RuntimeError("no contours found in the edge map; cannot locate the page")

largest_contour = max(cnts, key=cv2.contourArea)

# Minimum-area (rotated) rectangle around the page, as four corner points.
box = cv2.minAreaRect(largest_contour)
box = cv2.cv.BoxPoints(box) if imutils.is_cv2() else cv2.boxPoints(box)
box = np.array(box, dtype="int")

# Outline the detected page on the preview copy.
cv2.drawContours(img_copy, [box], -1, (0, 255, 0), 2)

# The corner order returned by boxPoints depends on the rectangle's angle
# (and OpenCV version), so order the corners explicitly before pairing them
# with the destination corners. In image coordinates (y grows downward):
#   x + y is smallest at top-left and largest at bottom-right,
#   y - x is smallest at top-right and largest at bottom-left.
corner_sum = box.sum(axis=1)
corner_diff = box[:, 1] - box[:, 0]
top_left = box[np.argmin(corner_sum)]
bottom_right = box[np.argmax(corner_sum)]
top_right = box[np.argmin(corner_diff)]
bottom_left = box[np.argmax(corner_diff)]

# Source and destination corners, both as
# bottom-left, top-left, top-right, bottom-right.
box_frame = np.float32([bottom_left, top_left, top_right, bottom_right])
img_frame = np.float32([[0, relative_h], [0, 0], [relative_w, 0], [relative_w, relative_h]])

# Warp the page so it fills the whole output frame.
matrix = cv2.getPerspectiveTransform(box_frame, img_frame)
straight_img = cv2.warpPerspective(img, matrix, (relative_w, relative_h))

# Binarize the straightened page so the text stands out for OCR.
gray2 = cv2.cvtColor(straight_img, cv2.COLOR_BGR2GRAY)

# Mean adaptive threshold with block size 11 and constant 2.
adaptive_threshold = cv2.adaptiveThreshold(gray2, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)

# Inverted copy; not used by the visible part of the script.
adaptive_threshold_inv = cv2.bitwise_not(adaptive_threshold)

# 3x3 median filter applied to the thresholded image.
median_blur = cv2.medianBlur(adaptive_threshold, 3)

# Convert back to a 3-channel image before handing it to Tesseract.
img_rgb = cv2.cvtColor(median_blur, cv2.COLOR_GRAY2RGB)

# Image size; not used by the visible part of the script.
hImg, wImg, _ = img_rgb.shape

# Word-level OCR results as text, one row per detected element
# (a sample is shown in the "#detected words table" log).
boxes = pytesseract.image_to_data(img_rgb)

print(boxes)

# record sentences from detected texts
#
# Each row of `boxes` is whitespace-split; only rows with all 12 columns
# (i.e. rows that carry a recognized word in the last column) are used.
# Consecutive words are grouped into a block until the left edge jumps by
# more than 300 px, which starts a new block. A block's 'x' is the left edge
# of its last word.
blocks = []
previous_x = 0
block = []

for i, b in enumerate(boxes.splitlines()):
    if i == 0:
        continue  # header row

    b = b.split()
    if len(b) != 12:
        continue  # structural row or row without text

    left, top, width, height = (int(b[6]), int(b[7]), int(b[8]), int(b[9]))
    word = b[11]

    # Annotate the straightened image with the word's box and text.
    cv2.rectangle(straight_img, (left, top), (width + left, height + top), (0, 0, 255), 1)
    cv2.putText(straight_img, word, (left, top), cv2.FONT_HERSHEY_COMPLEX, 0.5, (50, 50, 255), 2)

    if abs(left - previous_x) > 300:
        # Close the current block. Skip it when empty, which happens when the
        # very first word already lies more than 300 px from the initial x=0.
        if block:
            blocks.append({'block': block, 'x': previous_x})
        block = []

    block.append(word)
    previous_x = left

# Flush the last block, unless no words were detected at all.
if block:
    blocks.append({'block': block, 'x': previous_x})

print(blocks)

# For each block in turn, find the first later block whose x is within
# 300 px of it and move that block to sit directly after it. The list length
# never changes (one pop, one insert), so index iteration stays valid.
block_length = len(blocks)

for i in range(block_length):
    anchor_x = blocks[i]['x']
    j = next(
        (k for k in range(i + 1, block_length)
         if abs(anchor_x - blocks[k]['x']) < 300),
        None,
    )
    if j is not None:
        blocks.insert(i + 1, blocks.pop(j))

print(blocks)

# join the ordered blocks into a paragraph
#
# A block whose x is within 300 px of the previous block's x is followed by
# a newline; any other block is followed by a space.
previous_x = 0
pieces = []

for entry in blocks:
    separator = "\n" if abs(entry['x'] - previous_x) < 300 else " "
    pieces.append(" ".join(entry['block']) + separator)
    previous_x = entry['x']

paragraph = "".join(pieces)

print(paragraph)

# Show the intermediate images, wait for a key press, then read the
# paragraph aloud.
preview_windows = (
    ("picture", img_copy),
    # ("canny edge", edged),
    ("straight image", straight_img),
    ("median blur", median_blur),
)
for window_title, frame in preview_windows:
    cv2.imshow(window_title, frame)

cv2.waitKey(0)

speak(paragraph)

------------------------------------

#logs

#detected words table

level page_num block_num par_num line_num word_num left top width height conf text

1 1 0 0 0 0 0 0 1500 1125 -1

2 1 1 0 0 0 180 997 576 24 -1

3 1 1 1 0 0 180 997 576 24 -1

4 1 1 1 1 0 180 997 576 24 -1

5 1 1 1 1 1 180 1009 28 12 85 SEE

5 1 1 1 1 2 215 1008 34 12 85 OUR

5 1 1 1 1 3 257 993 77 35 95 BIRTHDAY

5 1 1 1 1 4 345 993 59 35 91 OFFERS

5 1 1 1 1 5 424 1013 2 3 7 "

5 1 1 1 1 6 730 1000 26 11 5 ot

2 1 2 0 0 0 499 1006 433 52 -1

3 1 2 1 0 0 499 1006 433 52 -1

4 1 2 1 1 0 499 1006 433 29 -1

5 1 2 1 1 1 499 1012 40 23 30 THE

5 1 2 1 1 2 546 1012 62 20 96 PRICES

5 1 2 1 1 3 614 1019 21 12 91 IN

5 1 2 1 1 4 641 1018 42 12 91 THIS

5 1 2 1 1 5 690 1017 104 13 92 CATALOGUE

5 1 2 1 1 6 801 1016 36 12 92 CAN

5 1 2 1 1 7 843 1006 49 22 96 ONLY

5 1 2 1 1 8 898 1006 34 22 96 GET

4 1 2 1 2 0 506 1036 395 22 -1

5 1 2 1 2 1 506 1040 61 12 95 LOWER

5 1 2 1 2 2 574 1027 51 35 73 UNTIL

5 1 2 1 2 3 633 1027 39 35 73 JULY

5 1 2 1 2 4 681 1037 20 13 79 31

5 1 2 1 2 5 709 1037 49 15 79 2019,

5 1 2 1 2 6 765 1037 59 12 96 NEVER

5 1 2 1 2 7 830 1036 71 12 95 HIGHER

2 1 3 0 0 0 170 1022 204 47 -1

3 1 3 1 0 0 174 1022 200 43 -1

4 1 3 1 1 0 176 1022 190 20 -1

5 1 3 1 1 1 176 1028 19 12 71 AT

5 1 3 1 1 2 206 1015 28 34 71 THE

5 1 3 1 1 3 240 1025 69 15 93 BACK

5 1 3 1 1 4 291 1015 17 34 93 OF

5 1 3 1 1 5 316 1022 50 20 15 THE.

4 1 3 1 2 0 170 1041 204 28 -1

5 1 3 1 2 1 170 1041 100 28 94 CATALOGUE

5 1 3 1 2 2 349 1042 25 3 17 mo

2 1 4 0 0 0 0 0 1500 1125 -1

3 1 4 1 0 0 0 0 1500 1125 -1

4 1 4 1 1 0 0 0 1500 1125 -1

5 1 4 1 1 1 0 0 1500 1125 95

#sentence

[{'block': ['SEE', 'OUR', 'BIRTHDAY', 'OFFERS', '"'], 'x': 424}, {'block': ['ot', 'THE', 'PRICES', 'IN', 'THIS', 'CATALOGUE', 'CAN', 'ONLY', 'GET'], 'x': 898}, {'block': ['LOWER', 'UNTIL', 'JULY', '31', '2019,', 'NEVER', 'HIGHER'], 'x': 830}, {'block': ['AT', 'THE', 'BACK', 'OF', 'THE.', 'CATALOGUE', 'mo'], 'x': 349}]

#ordered sentence

[{'block': ['SEE', 'OUR', 'BIRTHDAY', 'OFFERS', '"'], 'x': 424}, {'block': ['AT', 'THE', 'BACK', 'OF', 'THE.', 'CATALOGUE', 'mo'], 'x': 349}, {'block': ['ot', 'THE', 'PRICES', 'IN', 'THIS', 'CATALOGUE', 'CAN', 'ONLY', 'GET'], 'x': 898}, {'block': ['LOWER', 'UNTIL', 'JULY', '31', '2019,', 'NEVER', 'HIGHER'], 'x': 830}]

#paragraph

SEE OUR BIRTHDAY OFFERS " AT THE BACK OF THE. CATALOGUE mo

ot THE PRICES IN THIS CATALOGUE CAN ONLY GET LOWER UNTIL JULY 31 2019, NEVER HIGHER

reference:

perspective transform

http://chuanshuoge2.blogspot.com/2021/04/opencv-31-perspective-transform.html

adaptive threshold

http://chuanshuoge2.blogspot.com/2021/03/opencv-13-adaptive-thresholding.html

text to speech

https://chuanshuoge2.blogspot.com/2021/02/python-speech-recognition.html