私的AI研究会 > Tesseract3
実用的な AI開発に向けて、文字認識エンジン「Tesseract」(テッセラクト)を使用した「OCR アプリケーション」を開発する。
今回開発した OCR テストプログラム「ocrtest4.py」 を拡張する。
| コマンドオプション | デフォールト設定 | 意味 |
| -h, --help | | ヘルプ表示 |
| -i, --image | test1.png | 入力画像ファイル |
| -l, --language | jpn | 言語 |
| -c, --confidence | 40 | 有効とする信頼性スコア値 |
| -p, --process | n | 前処理(グレイスケール変換/2値化)フラグ (y/n) |
| -d, --linedel | n | 前処理(罫線消去)フラグ (y/b/n) b=バックグラウンド実行 |
| --layout | 6 | tesseractレイアウト(0-13) |
| --maxsize | 0 | 処理画像の最大ピクセル(0=リサイズしない) |
| --log | n | ログ出力フラグ (y/s/n) s=詳細ログ出力 |
| -t, --title | y | タイトル表示 (y/n) |
| -s, --speed | y | スピード計測表示 (y/n) |
| -o, --out | non | 処理結果を出力する場合のファイルパス |
(py37) $ python3 tryocr.py -h --- OCR on python Ver0.01 --- OpenCV version 4.5.2 usage: tryocr.py [-h] [-i IMAGE_FILE] [-l LANGUAGE] [-c CONFIDENCE] [-p PROCESS] [-d LINEDEL] [--layout LAYOUT] [--maxsize MAXSIZE] [--log LOG] [-t TITLE] [-s SPEED] [-o IMAGE_OUT] optional arguments: -h, --help show this help message and exit -i IMAGE_FILE, --image IMAGE_FILE Absolute path to image file or cam for camera stream. Default value is 'test1.png' -l LANGUAGE, --language LANGUAGE Language. Default value is 'jpn' -c CONFIDENCE, --confidence CONFIDENCE Confidence Level Default value is 40 -p PROCESS, --prosess PROCESS Preprocessing flag.(y/n) Default value is 'n' -d LINEDEL, --linedel LINEDEL Line delete flag.(y/b/n) Default value is 'n' --layout LAYOUT Tesseract layout Default value is 6 --maxsize MAXSIZE Image max size (free=0). Default value is 0 --log LOG Log output flag.(y/s/n) Default value is 'n' -t TITLE, --title TITLE Program title flag.(y/n) Default value is 'y' -s SPEED, --speed SPEED Speed display flag.(y/n) Default value is 'y' -o IMAGE_OUT, --out IMAGE_OUT Processed image file path. Default value is 'non'
# -*- coding: utf-8 -*-
##------------------------------------------
## OCR on python Ver0.01
## with tesseract & PyOCR
##
## 2021.11.14 Masahiro Izutsu
##------------------------------------------
## tryocr.py
# ANSI color escape codes for console output
GREEN = '\033[1;32m'
RED = '\033[1;31m'
NOCOLOR = '\033[0m'
YELLOW = '\033[1;33m'
# Constants
WINDOW_MAX = 1280  # maximum display size (pixels)
LINE_WORD_BOX_COLOR = (0, 0, 240)  # BGR color of line-level OCR boxes
WORD_BOX_COLOR = (255, 0, 0)       # BGR color of word-level OCR boxes
CONTENTS_COLOR = (0, 128, 0)       # color of the recognized-text overlay
from os.path import expanduser
DEF_INPUT_FILE = expanduser('test1.png')
# import処理
from PIL import Image
import sys
import pyocr
import pyocr.builders
import cv2
import argparse
import mylib
import myfunction
import numpy as np
import mylib_gui
# Program title / version banner
title = 'OCR on python Ver0.01'
print(GREEN)
print('--- {} ---'.format(title))
print(' OpenCV version {} '.format(cv2.__version__))
print(NOCOLOR)
# Parses arguments for the application
def parse_args():
    """Build and return the ArgumentParser for this program."""
    parser = argparse.ArgumentParser()
    # (flags, kwargs) for every supported command-line option
    specs = [
        (('-i', '--image'),
         dict(metavar='IMAGE_FILE', type=str, default=DEF_INPUT_FILE,
              help='Absolute path to image file or cam for camera stream. Default value is \'' + DEF_INPUT_FILE + '\'')),
        (('-l', '--language'),
         dict(metavar='LANGUAGE', default='jpn',
              help='Language. Default value is \'jpn\'')),
        (('-c', '--confidence'),
         dict(metavar='CONFIDENCE', default=40,
              help='Confidence Level Default value is 40')),
        (('-p', '--prosess'),
         dict(metavar='PROCESS', default='n',
              help='Preprocessing flag.(y/n) Default value is \'n\'')),
        (('-d', '--linedel'),
         dict(metavar='LINEDEL', default='n',
              help='Line delete flag.(y/b/n) Default value is \'n\'')),
        (('--layout',),
         dict(metavar='LAYOUT', default=6,
              help='Tesseract layout Default value is 6')),
        (('--maxsize',),
         dict(metavar='MAXSIZE', default=0,
              help='Image max size (free=0). Default value is 0')),
        (('--log',),
         dict(metavar='LOG', default='n',
              help='Log output flag.(y/s/n) Default value is \'n\'')),
        (('-t', '--title'),
         dict(metavar='TITLE', default='y',
              help='Program title flag.(y/n) Default value is \'y\'')),
        (('-s', '--speed'),
         dict(metavar='SPEED', default='y',
              help='Speed display flag.(y/n) Default value is \'y\'')),
        (('-o', '--out'),
         dict(metavar='IMAGE_OUT', default='non',
              help='Processed image file path. Default value is \'non\'')),
    ]
    for flags, kwargs in specs:
        parser.add_argument(*flags, **kwargs)
    return parser
# Display the application's effective settings on the console.
def display_info(image, lang, prosess, linedel, conf, layout, maxsize, log, titleflg, speedflg, outpath):
    """Print every run-time option value, one colored line per option."""
    print(YELLOW + title + ': Starting application...' + NOCOLOR)
    print(' - ' + YELLOW + 'Image File : ' + NOCOLOR, image)
    print(' - ' + YELLOW + 'Language : ' + NOCOLOR, lang)
    print(' - ' + YELLOW + 'Preprocessing: ' + NOCOLOR, prosess)
    print(' - ' + YELLOW + 'Line delete : ' + NOCOLOR, linedel)
    print(' - ' + YELLOW + 'Confidence : ' + NOCOLOR, conf)
    print(' - ' + YELLOW + 'Layout : ' + NOCOLOR, layout)
    print(' - ' + YELLOW + 'Max size : ' + NOCOLOR, maxsize)
    # FIX: label typo 'Log frag' -> 'Log flag'
    print(' - ' + YELLOW + 'Log flag : ' + NOCOLOR, log)
    print(' - ' + YELLOW + 'Program Title: ' + NOCOLOR, titleflg)
    print(' - ' + YELLOW + 'Speed flag : ' + NOCOLOR, speedflg)
    print(' - ' + YELLOW + 'Processed out: ' + NOCOLOR, outpath)
# 画像の種類を判別する
# 戻り値: 'jpeg' 'png' ... 画像ファイル
# 'None' 画像ファイル以外 (動画ファイル)
# 'NotFound' ファイルが存在しない
import imghdr
def is_pict(filename):
    """Return the image type of *filename* as a string.

    Returns:
        'jpeg', 'png', ...  recognized image-file type
        'None'              file exists but is not a recognized image
                            (imghdr.what() returned None, stringified)
        'NotFound'          file cannot be opened as an image
    """
    try:
        imgtype = imghdr.what(filename)
    except OSError:
        # FIX: broadened from FileNotFoundError — IsADirectoryError,
        # PermissionError etc. equally mean the input is unusable.
        imgtype = 'NotFound'
    return str(imgtype)
# Image preprocessing
def img_preproces(img):
    """Grayscale + Otsu-binarize *img* in place and return it.

    The binarized result is written back into all three BGR channels,
    so the array keeps its 3-channel shape.
    """
    blue, green, red = img[:, :, 0], img[:, :, 1], img[:, :, 2]
    # BT.601 luminance weighting (same expression/order as before)
    luma = 0.299 * red + 0.587 * green + 0.114 * blue
    luma8 = np.uint8(luma)
    # With THRESH_OTSU the thresh/maxval arguments are ignored and the
    # threshold is selected automatically.
    _, binary = cv2.threshold(luma8, thresh=0, maxval=255,
                              type=cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    for channel in range(3):
        img[:, :, channel] = binary
    return img
# Ruled-line removal
def delete_line(img):
    """Return a copy of *img* with long straight (ruled) lines painted white."""
    # Hough parameters derived heuristically from the image width
    img_h, img_w = img.shape[:2]
    hough_thr = 100
    min_length = max(int(img_w / 18), 44)
    max_gap = int(img_w / 1000) + 4
    print('\nThreshhold={}, MinLineLength={}, MaxLineGap={}, width={}, height={}'.format(
        hough_thr, min_length, max_gap, img_w, img_h))
    result = img.copy()
    # grayscale -> Otsu binarization (thresh/maxval ignored with THRESH_OTSU)
    work = cv2.cvtColor(result, cv2.COLOR_BGR2GRAY)
    _, work = cv2.threshold(work, thresh=0, maxval=255,
                            type=cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # invert (negative) so dark lines become white for the detector
    work = cv2.bitwise_not(work)
    segments = cv2.HoughLinesP(work, rho=1, theta=np.pi / 360,
                               threshold=hough_thr,
                               minLineLength=min_length, maxLineGap=max_gap)
    if segments is not None:
        for segment in segments:
            x1, y1, x2, y2 = segment[0]
            # erase by drawing the detected line in white
            result = cv2.line(result, (x1, y1), (x2, y2), (255, 255, 255), 3)
    return result
# ** main function **
def main():
    """Run the OCR application.

    Reads a still image or a camera/movie stream, optionally preprocesses
    it (binarization / ruled-line removal), runs Tesseract through pyocr,
    draws the detected line/word boxes and text on the frame, and shows
    (and optionally records) the result until the user quits.
    """
    # Japanese-capable font used when drawing recognized text
    fontPIL = 'NotoSansCJK-Bold.ttc'
    # Argument parsing and parameter setting
    ARGS = parse_args().parse_args()
    input_stream = ARGS.image
    lang = ARGS.language
    titleflg = ARGS.title
    speedflg = ARGS.speed
    linedel = ARGS.linedel
    conf = int(ARGS.confidence)
    layout = int(ARGS.layout)
    maxsize = int(ARGS.maxsize)
    logflg = ARGS.log
    prosess = ARGS.prosess
    if ARGS.image.lower() == "cam" or ARGS.image.lower() == "camera":
        input_stream = 0          # device index 0 for cv2.VideoCapture
        isstream = True
    else:
        filetype = is_pict(input_stream)
        isstream = filetype == 'None'   # not a still image -> treat as movie
        if (filetype == 'NotFound'):
            print(RED + "\ninput file Not found." + NOCOLOR)
            quit()
    outpath = ARGS.out
    # show settings
    display_info(input_stream, lang, prosess, linedel, conf, layout, maxsize, logflg, titleflg, speedflg, outpath)
    # OCR backend (first available pyocr tool, normally Tesseract)
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print(RED + "\nOCR tool Not found." + NOCOLOR)
        quit()
    tool = tools[0]
    # prepare the input
    if (isstream):
        # camera / movie stream
        cap = cv2.VideoCapture(input_stream)
        ret, frame = cap.read()
        if ret == False:
            print(RED + "\nUnable to video camera." + NOCOLOR)
            quit()
        loopflg = cap.isOpened()
    else:
        # still image file
        frame = cv2.imread(input_stream)
        if frame is None:
            print(RED + "\nUnable to read the input." + NOCOLOR)
            quit()
        if maxsize > 300:
            # shrink to fit within maxsize, keeping the aspect ratio
            img_h, img_w = frame.shape[:2]
            if (img_w > img_h):
                if (img_w > maxsize):
                    height = round(img_h * (maxsize / img_w))
                    frame = cv2.resize(frame, dsize = (maxsize, height))
            else:
                if (img_h > maxsize):
                    width = round(img_w * (maxsize / img_h))
                    frame = cv2.resize(frame, dsize = (width, maxsize))
        loopflg = True     # single pass for a still image
    # parameter banner drawn on the frame
    h, w = frame.shape[:2]
    st_pram = 'pros={}, linedel={}, conf={}, layout={} width={}, height={}'.format(prosess, linedel, conf, layout, w, h)
    # recording of results, step 1: open the video writer (stream only)
    if (outpath != 'non'):
        if (isstream):
            fps = int(cap.get(cv2.CAP_PROP_FPS))
            out_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
            out_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
            fourcc = cv2.VideoWriter_fourcc('m', 'p', '4', 'v')
            outvideo = cv2.VideoWriter(outpath, fourcc, fps, (out_w, out_h))
    # FPS measurement (NOTE: instance name shadows the mylib class name)
    fpsWithTick = mylib.fpsWithTick()
    frame_count = 0
    fps_total = 0
    fpsWithTick.get()    # start the FPS measurement
    # main loop
    while (loopflg):
        if frame is None:
            print(RED + "\nUnable to read the input." + NOCOLOR)
            quit()
        # image preprocessing
        if (prosess == 'y'):     # grayscale/binarize, result also displayed
            frame = img_preproces(frame)
        if (linedel == 'y'):     # ruled-line removal, result also displayed
            frame = delete_line(frame)
            frame_pl = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        elif (linedel == 'b'):   # ruled-line removal for OCR input only
            frame_pl = delete_line(frame)
            frame_pl = cv2.cvtColor(frame_pl, cv2.COLOR_RGB2BGR)
        else:                    # no ruled-line removal
            frame_pl = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        # convert to a PIL image for pyocr
        img = Image.fromarray(frame_pl)
        # character recognition
        line_and_word_boxes = tool.image_to_string(img, lang=lang, builder=pyocr.builders.LineBoxBuilder(tesseract_layout=layout))
        for lw_box in line_and_word_boxes:
            content = lw_box.content
            position = lw_box.position
            box = []
            txt = []
            n = 0
            # NOTE(review): the loop variable shadows the outer lw_box;
            # 'confidence' ends up being the last word's score.
            for lw_box in lw_box.word_boxes:
                txt.append(lw_box.content)
                box.append(lw_box.position)
                n = n+1
                confidence = lw_box.confidence
            if confidence > conf and len(content) > 0:
                xmin = position[0][0]
                ymin = position[0][1]
                xmax = position[1][0]
                ymax = position[1][1]
                # line-level bounding box
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), color=LINE_WORD_BOX_COLOR, thickness=3)
                # word-level bounding boxes
                for nm in range(n):
                    cv2.rectangle(frame, (box[nm][0][0], box[nm][0][1]), (box[nm][1][0], box[nm][1][1]), color=WORD_BOX_COLOR, thickness=1)
                st_score = '#Score{:3}: '.format(confidence) + content
                myfunction.cv2_putText(img = frame,
                                       text = st_score,
                                       org = (xmin, ymin - 4),
                                       fontFace = fontPIL,
                                       fontScale = 20,
                                       color = CONTENTS_COLOR,
                                       mode = 0)
                # console logging ('y' = normal, 's' = detailed)
                if (logflg == 'y') or (logflg == 's'):
                    print('\ncontents: ', content)
                    print('position: ', position)
                    print('confidence: ', confidence)
                    if (logflg == 's'):
                        for nm in range(n):
                            print(' {: <8}'.format(txt[nm]), ' ', box[nm])
        # compute and draw FPS
        fps = fpsWithTick.get()
        st_fps = 'fps: {:>6.2f}'.format(fps)
        if (speedflg == 'y'):
            cv2.rectangle(frame, (10, 38), (95, 55), (90, 90, 90), -1)
            cv2.putText(frame, st_fps, (15, 50), cv2.FONT_HERSHEY_DUPLEX, fontScale=0.4, color=(255, 255, 255), lineType=cv2.LINE_AA)
        # draw the title banner
        if (titleflg == 'y'):
            cv2.putText(frame, title, (10, 30), cv2.FONT_HERSHEY_DUPLEX, fontScale=0.8, color=(200, 200, 0), lineType=cv2.LINE_AA)
            cv2.putText(frame, st_pram, (100, 50), cv2.FONT_HERSHEY_DUPLEX, fontScale=0.5, color=(0, 0, 0), lineType=cv2.LINE_AA)
        # display the frame
        window_name = title + " (hit 'q' or 'esc' key to exit)"
        cv2.namedWindow(window_name, cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
        cv2.imshow(window_name, frame)
        # recording of results, step 2: write the processed frame
        if (outpath != 'non'):
            if (isstream):
                outvideo.write(frame)
            else:
                cv2.imwrite(outpath, frame)
        # key handling: a still image blocks here until quit/close;
        # a stream polls once per frame
        breakflg = False
        while(True):
            key = cv2.waitKey(1)
            if key == 27 or key == 113: # 'esc' or 'q'
                breakflg = True
                break
            if (isstream):
                break
            if not mylib_gui._is_visible(window_name): # 'Close' button
                break
        if ((breakflg == False) and isstream):
            # read the next frame
            ret, frame = cap.read()
            if ret == False:
                break
            loopflg = cap.isOpened()
        else:
            loopflg = False
    # cleanup
    if (isstream):
        cap.release()
    # recording of results, step 3: close the writer
    if (outpath != 'non'):
        if (isstream):
            outvideo.release()
    cv2.destroyAllWindows()
    print('\nFPS average: {:>10.2f}'.format(fpsWithTick.get_average()))
    print('\n Finished.')
# main entry point (execution starts here)
if __name__ == "__main__":
    sys.exit(main())
(py37) cd ~/workspace_py37/pyocr/ (py37) $ python3 tryocr.py --- OCR on python Ver0.01 --- OpenCV version 4.5.2 OCR on python Ver0.01: Starting application... - Image File : test1.png - Language : jpn - Preprocessing: n - Line delete : n - Confidence : 40 - Layout : 6 - Max size : 0 - Log frag : n - Program Title: y - Speed flag : y - Processed out: non FPS average: 1.70 Finished.
次のステップとして「帳票」画像からそれぞれの項目を切り出して OCRに入力し結果をラベリングするアプリケーションを作成する。
前段階としてのユーザーインターフェイスと GUIを検討する。
(py37) $ mkdir ~/workspace_py37/tryocr/ (py37) $ cd ~/workspace_py37/tryocr/ (py37) $ cp ~/workspace_py37/exercise/cvui/mouse-complex.py mouse-complex1.py
# -*- coding: utf-8 -*-
##------------------------------------------
## cvui demo Program (mouse-complex.py)
##
## 2021.12.19 Masahiro Izutsu
##------------------------------------------
## mouse-complex1.py
#
# This is application uses the mouse API to dynamically create a ROI
# for image visualization.
#
# Copyright (c) 2018 Fernando Bevilacqua <dovyski@gmail.com>
# Licensed under the MIT license.
#
import numpy as np
import cv2
import cvui
import mylib_gui
WINDOW_NAME = 'Original Image'  # main image window title
ROI_WINDOW = 'Cut-out area'     # window showing the selected region
def main():
    """cvui demo: drag the mouse on the image to select a ROI, which is
    shown in a second window once the drag finishes."""
    lena = cv2.imread('lena.jpg')
    frame = np.zeros(lena.shape, np.uint8)
    anchor = cvui.Point()          # drag start point
    roi = cvui.Rect(0, 0, 0, 0)    # current selection
    working = False                # True while a drag is in progress
    pos1 = 0
    pos2 = 0
    frame_h, frame_w = frame.shape[:2]
    # Init cvui and tell it to create a OpenCV window, i.e. cv.namedWindow(WINDOW_NAME).
    cv2.namedWindow(WINDOW_NAME, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
    cvui.init(WINDOW_NAME)
    while (True):
        # Fill the frame with Lena's image
        frame[:] = lena[:]
        # Show the instruction text on the screen
        cvui.text(frame, 10, 10, 'Click (any) mouse button and drag the pointer around to select a ROI.')
        # Mouse events: cvui.DOWN = pressed, cvui.UP = released,
        # cvui.CLICK = click, cvui.IS_DOWN = dragging
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.DOWN):
            # place the anchor at the mouse pointer
            anchor.x = cvui.mouse().x
            anchor.y = cvui.mouse().y
            # a drag is in progress (ROI window is not updated meanwhile)
            working = True
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.IS_DOWN):
            # update the selection rectangle
            width = cvui.mouse().x - anchor.x
            height = cvui.mouse().y - anchor.y
            roi.x = anchor.x + width if width < 0 else anchor.x
            roi.y = anchor.y + height if height < 0 else anchor.y
            roi.width = abs(width)
            roi.height = abs(height)
            # show position and size next to the pointer
            cvui.printf(frame, roi.x + 5, roi.y + 5, 0.3, 0xff0000, '(%d,%d)', roi.x, roi.y)
            cvui.printf(frame, cvui.mouse().x + 5, cvui.mouse().y + 5, 0.3, 0xff0000, 'w:%d, h:%d', roi.width, roi.height)
        if cvui.mouse(cvui.UP):
            # selection finished
            working = False
        # clamp the selection to the image bounds
        lenaRows, lenaCols, lenaChannels = lena.shape
        roi.x = 0 if roi.x < 0 else roi.x
        roi.y = 0 if roi.y < 0 else roi.y
        # FIX: numpy arrays have no .cols/.rows attributes — the original
        # 'lena.cols'/'lena.rows' raised AttributeError whenever the
        # selection ran past the image border. Use the unpacked sizes.
        roi.width = roi.width + lenaCols - (roi.x + roi.width) if roi.x + roi.width > lenaCols else roi.width
        roi.height = roi.height + lenaRows - (roi.y + roi.height) if roi.y + roi.height > lenaRows else roi.height
        # render the selection rectangle
        cvui.rect(frame, roi.x, roi.y, roi.width, roi.height, 0xff0000)
        # update cvui internal state
        cvui.update()
        # show the main window (moved into place for the first few frames)
        cv2.imshow(WINDOW_NAME, frame)
        if pos1 < 10:
            cv2.moveWindow(WINDOW_NAME, 0, 0)
            pos1 = pos1 + 1
        else:
            pos1 = 10
        # show the selected region once the drag has finished
        if roi.area() > 0 and working == False:
            lenaRoi = lena[roi.y : roi.y + roi.height, roi.x : roi.x + roi.width]
            cv2.namedWindow(ROI_WINDOW, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
            cv2.imshow(ROI_WINDOW, lenaRoi)
            if pos2 < 10:
                cv2.moveWindow(ROI_WINDOW, frame_w + 80, 0)
                pos2 = pos2 + 1
            else:
                pos2 = 10
        key = cv2.waitKey(20)
        if key == 27 or key == 113:   # 'esc' or 'q'
            break
        if not mylib_gui._is_visible(WINDOW_NAME):   # 'Close' button
            break
    cv2.destroyAllWindows()
    print('\n Finished.')
# main entry point
if __name__ == '__main__':
    main()
(py37) $ cd ~/workspace_py37/tryocr (py37) $ python3 tryocr_step1.py --- TryOCR Test Program Step-1 --- OpenCV version 4.5.2 TryOCR Test Program Step-1: Starting application... - Image File : sample0.png - Language : jpn - Layout : 6 - Max size : 1000 - Program Title: y ----------- NSホールディングス株式会社 三菱UFJ銀行 吹晶支店 ----------- Finished.
# -*- coding: utf-8 -*-
##------------------------------------------
## TryOCR Test Program Step-1
## with tesseract & PyOCR & cvui
##
## 2021.12.19 Masahiro Izutsu
##------------------------------------------
## tryocr_step1.py
# ANSI color escape codes for console output
GREEN = '\033[1;32m'
RED = '\033[1;31m'
NOCOLOR = '\033[0m'
YELLOW = '\033[1;33m'
# Constants
LINE_WORD_BOX_COLOR = (0, 0, 240)  # BGR color of line-level OCR boxes
WORD_BOX_COLOR = (255, 0, 0)       # BGR color of word-level OCR boxes
CONTENTS_COLOR = (0, 128, 0)       # color of the recognized-text overlay
from os.path import expanduser
DEF_INPUT_FILE = expanduser('sample0.png')
# import処理
from PIL import Image
import sys
import pyocr
import pyocr.builders
import cv2
import cvui
import argparse
import myfunction
import numpy as np
import mylib_gui
import mylib_pros
# Program title / version banner
title = 'TryOCR Test Program Step-1'
print(GREEN)
print('--- {} ---'.format(title))
print(' OpenCV version {} '.format(cv2.__version__))
print(NOCOLOR)
# Parses arguments for the application
def parse_args():
    """Build and return the ArgumentParser for this program."""
    parser = argparse.ArgumentParser()
    # (flags, kwargs) for every supported command-line option
    specs = [
        (('-i', '--image'),
         dict(metavar='IMAGE_FILE', type=str, default=DEF_INPUT_FILE,
              help='Absolute path to image file. Default value is \'' + DEF_INPUT_FILE + '\'')),
        (('-l', '--language'),
         dict(metavar='LANGUAGE', default='jpn',
              help='Language. Default value is \'jpn\'')),
        (('--layout',),
         dict(metavar='LAYOUT', default=6,
              help='Tesseract layout Default value is 6')),
        (('--maxsize',),
         dict(metavar='MAXSIZE', default=1000,
              help='Image max size (free=0). Default value is 1000')),
        (('-t', '--title'),
         dict(metavar='TITLE', default='y',
              help='Program title flag.(y/n) Default value is \'y\'')),
    ]
    for flags, kwargs in specs:
        parser.add_argument(*flags, **kwargs)
    return parser
# Display the application's effective settings on the console.
def display_info(image, lang, layout, maxsize, titleflg):
    """Print every run-time option value, one colored line per option."""
    print(YELLOW + title + ': Starting application...' + NOCOLOR)
    rows = [
        ('Image File : ', image),
        ('Language : ', lang),
        ('Layout : ', layout),
        ('Max size : ', maxsize),
        ('Program Title: ', titleflg),
    ]
    for label, value in rows:
        print(' - ' + YELLOW + label + NOCOLOR, value)
def frame_resize(image, maxsize):
    """Resize *image* to fit in a maxsize x maxsize box, keeping aspect ratio.

    Images are only shrunk, never enlarged, and only when maxsize > 300;
    otherwise the input array is returned unchanged.
    """
    if maxsize <= 300:
        return image
    img_h, img_w = image.shape[:2]
    if img_w > img_h:
        # landscape: the width is the limiting side
        if img_w > maxsize:
            new_h = round(img_h * (maxsize / img_w))
            image = cv2.resize(image, dsize = (maxsize, new_h))
    elif img_h > maxsize:
        # portrait (or square): the height is the limiting side
        new_w = round(img_w * (maxsize / img_h))
        image = cv2.resize(image, dsize = (new_w, maxsize))
    return image
WINDOW_NAME = title                      # main image window
ROI_WINDOW = 'Cut-out area'              # cut-out region window
ROI_POPUP = 'OCR detection result Text'  # OCR result popup window
# ** main function **
def main():
    """Interactive ROI OCR: select a region of the displayed image with
    the mouse; the region is cut out, sent to Tesseract and the result
    shown in a popup window and printed to the console."""
    # Japanese-capable font for drawing the OCR result
    fontPIL = 'NotoSansCJK-Bold.ttc'
    # Argument parsing and parameter setting
    ARGS = parse_args().parse_args()
    input_stream = ARGS.image
    lang = ARGS.language
    layout = int(ARGS.layout)
    maxsize = int(ARGS.maxsize)
    titleflg = ARGS.title
    # show settings
    display_info(input_stream, lang, layout, maxsize, titleflg)
    # OCR backend (first available pyocr tool, normally Tesseract)
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print(RED + "\nOCR tool Not found." + NOCOLOR)
        quit()
    tool = tools[0]
    # read the image with OpenCV
    lena_frame = cv2.imread(input_stream)
    if lena_frame is None:
        print(RED + "\nUnable to read the input." + NOCOLOR)
        quit()
    lena_frame = mylib_pros.frame_resize(lena_frame, maxsize)
    frame = np.zeros(lena_frame.shape, np.uint8)
    popup_frame = np.zeros((120, 500, 3), np.uint8)
    anchor = cvui.Point()          # drag start point
    roi = cvui.Rect(0, 0, 0, 0)    # current selection
    working = False                # True while a drag is in progress
    frame_h, frame_w = frame.shape[:2]
    outf = False                   # print the next OCR result once
    print('\n -----------')
    # Init cvui and tell it to create a OpenCV window, i.e. cv.namedWindow(WINDOW_NAME).
    cv2.namedWindow(WINDOW_NAME, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
    cvui.init(WINDOW_NAME)
    while (True):
        # Fill the frame with the source image
        frame[:] = lena_frame[:]
        cvui.text(frame, 10, 10, 'Click mouse left-button and drag the pointer around to select a cut area.')
        # mouse events
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.DOWN):
            # place the anchor at the mouse pointer
            anchor.x = cvui.mouse().x
            anchor.y = cvui.mouse().y
            working = True
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.IS_DOWN):
            # update the selection rectangle
            width = cvui.mouse().x - anchor.x
            height = cvui.mouse().y - anchor.y
            roi.x = anchor.x + width if width < 0 else anchor.x
            roi.y = anchor.y + height if height < 0 else anchor.y
            roi.width = abs(width)
            roi.height = abs(height)
            # show position and size next to the pointer
            cvui.printf(frame, roi.x + 5, roi.y + 5, 0.3, 0xff0000, '(%d,%d)', roi.x, roi.y)
            cvui.printf(frame, cvui.mouse().x + 5, cvui.mouse().y + 5, 0.3, 0xff0000, 'w:%d, h:%d', roi.width, roi.height)
        if cvui.mouse(cvui.UP):
            # selection finished; print the next OCR result
            working = False
            outf = True
        # clamp the selection to the image bounds
        lenaRows, lenaCols, lenaChannels = lena_frame.shape
        roi.x = 0 if roi.x < 0 else roi.x
        roi.y = 0 if roi.y < 0 else roi.y
        # FIX: numpy arrays have no .cols/.rows attributes — the original
        # 'lena_frame.cols'/'lena_frame.rows' raised AttributeError when
        # the selection ran past the image border. Use the unpacked sizes.
        roi.width = roi.width + lenaCols - (roi.x + roi.width) if roi.x + roi.width > lenaCols else roi.width
        roi.height = roi.height + lenaRows - (roi.y + roi.height) if roi.y + roi.height > lenaRows else roi.height
        # render the selection rectangle
        cvui.rect(frame, roi.x, roi.y, roi.width, roi.height, 0xff0000)
        # draw the title banner
        if (titleflg == 'y'):
            cv2.putText(frame, title, (10, 30), cv2.FONT_HERSHEY_DUPLEX, fontScale=0.8, color=(200, 200, 0), lineType=cv2.LINE_AA)
        # update cvui internal state and show the main window
        cvui.update()
        cv2.imshow(WINDOW_NAME, frame)
        cv2.moveWindow(WINDOW_NAME, 80, 0)
        # show and OCR the selected region once the drag has finished
        if roi.area() > 0 and working == False:
            lenaRoi = lena_frame[roi.y : roi.y + roi.height, roi.x : roi.x + roi.width]
            cv2.namedWindow(ROI_WINDOW, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
            cv2.imshow(ROI_WINDOW, lenaRoi)
            cv2.moveWindow(ROI_WINDOW, frame_w + 100, 0)
            lenaRoi_h, lenaRoi_w = lenaRoi.shape[:2]
            # OCR on the cut-out area: convert to a PIL image first
            lenaRoi1 = cv2.cvtColor(lenaRoi, cv2.COLOR_RGB2BGR)
            imgRoi = Image.fromarray(lenaRoi1)
            # txt is a Python string
            txt = tool.image_to_string(imgRoi, lang=lang,
                                       builder=pyocr.builders.TextBuilder(tesseract_layout=layout))
            # draw the recognized text in the popup window
            popup_frame[:,:,:] = 0
            cv2.rectangle(popup_frame, (0, 88), (500, 105), (255,0,0), -1)
            myfunction.cv2_putText(img = popup_frame,
                                   text = txt,
                                   org = (15, 104),
                                   fontFace = fontPIL,
                                   fontScale = 12,
                                   color = (255,255,255),
                                   mode = 0)
            cv2.namedWindow(ROI_POPUP, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
            cv2.imshow(ROI_POPUP, popup_frame)
            cv2.moveWindow(ROI_POPUP, frame_w + 100, lenaRoi_h + 100)
            if outf:
                print(' ', txt)
                outf = False
        key = cv2.waitKey(1)
        if key == 27 or key == 113: # 'esc' or 'q'
            break
        if not mylib_gui._is_visible(title): # 'Close' button
            break
    cv2.destroyAllWindows()
    print(' -----------\n Finished.')
# main entry point (execution starts here)
if __name__ == "__main__":
    sys.exit(main())
# -*- coding: utf-8 -*- ##------------------------------------------ ## My Library process with OpenCV ## ## 2021.12.19 Masahiro Izutsu ##------------------------------------------ ## mylib_pros.py import cv2 # アスペクト比を固定して画像をリサイズ def frame_resize(image, maxsize): if maxsize > 300: img_h, img_w = image.shape[:2] if (img_w > img_h): if (img_w > maxsize): height = round(img_h * (maxsize / img_w)) image = cv2.resize(image, dsize = (maxsize, height)) else: if (img_h > maxsize): width = round(img_w * (maxsize / img_h)) image = cv2.resize(image, dsize = (width, maxsize)) return image
(py37) $ cd ~/workspace_py37/tryocr (py37) $ python3 tryocr_step2.py -i ../pyocr/sample2.png --- TryOCR Test Program Step-2 --- OpenCV version 4.5.2 TryOCR Test Program Step-2: Starting application... - Image File : ../pyocr/sample2.png - Language : jpn - Layout : 6 - Max size : 1000 - Program Title: y original w x h : 7017 x 4958 display w x h : 1000 x 707 scale w x h : 0.143 x 0.143 ----------- NSホールディングス株式会社 <area> (252, 772) - (2027, 947) 三菱UFJ銀行 吹田支店 <area> (421, 5249) - (1550, 5403) ----------- Finished.
# -*- coding: utf-8 -*-
##------------------------------------------
## TryOCR Test Program Step-2
## with tesseract & PyOCR & cvui
##
## 2021.12.19 Masahiro Izutsu
##------------------------------------------
## tryocr_step2.py
# ANSI color escape codes for console output
GREEN = '\033[1;32m'
RED = '\033[1;31m'
NOCOLOR = '\033[0m'
YELLOW = '\033[1;33m'
# Constants
LINE_WORD_BOX_COLOR = (0, 0, 240)  # BGR color of line-level OCR boxes
WORD_BOX_COLOR = (255, 0, 0)       # BGR color of word-level OCR boxes
CONTENTS_COLOR = (0, 128, 0)       # color of the recognized-text overlay
from os.path import expanduser
DEF_INPUT_FILE = expanduser('sample0.png')
# import処理
from PIL import Image
import sys
import pyocr
import pyocr.builders
import cv2
import cvui
import argparse
import myfunction
import numpy as np
import mylib_gui
import mylib_frame
# Program title / version banner
title = 'TryOCR Test Program Step-2'
print(GREEN)
print('--- {} ---'.format(title))
print(' OpenCV version {} '.format(cv2.__version__))
print(NOCOLOR)
# Parses arguments for the application
def parse_args():
    """Build and return the ArgumentParser for this program."""
    parser = argparse.ArgumentParser()
    # (flags, kwargs) for every supported command-line option
    specs = [
        (('-i', '--image'),
         dict(metavar='IMAGE_FILE', type=str, default=DEF_INPUT_FILE,
              help='Absolute path to image file. Default value is \'' + DEF_INPUT_FILE + '\'')),
        (('-l', '--language'),
         dict(metavar='LANGUAGE', default='jpn',
              help='Language. Default value is \'jpn\'')),
        (('--layout',),
         dict(metavar='LAYOUT', default=6,
              help='Tesseract layout Default value is 6')),
        (('--maxsize',),
         dict(metavar='MAXSIZE', default=1000,
              help='Image max size (free=0). Default value is 1000')),
        (('-t', '--title'),
         dict(metavar='TITLE', default='y',
              help='Program title flag.(y/n) Default value is \'y\'')),
    ]
    for flags, kwargs in specs:
        parser.add_argument(*flags, **kwargs)
    return parser
# Display the application's effective settings on the console.
def display_info(image, lang, layout, maxsize, titleflg):
    """Print every run-time option value, one colored line per option."""
    print(YELLOW + title + ': Starting application...' + NOCOLOR)
    rows = [
        ('Image File : ', image),
        ('Language : ', lang),
        ('Layout : ', layout),
        ('Max size : ', maxsize),
        ('Program Title: ', titleflg),
    ]
    for label, value in rows:
        print(' - ' + YELLOW + label + NOCOLOR, value)
def frame_resize(image, maxsize):
    """Resize *image* to fit in a maxsize x maxsize box, keeping aspect ratio.

    Images are only shrunk, never enlarged, and only when maxsize > 300;
    otherwise the input array is returned unchanged.
    """
    if maxsize <= 300:
        return image
    img_h, img_w = image.shape[:2]
    if img_w > img_h:
        # landscape: the width is the limiting side
        if img_w > maxsize:
            new_h = round(img_h * (maxsize / img_w))
            image = cv2.resize(image, dsize = (maxsize, new_h))
    elif img_h > maxsize:
        # portrait (or square): the height is the limiting side
        new_w = round(img_w * (maxsize / img_h))
        image = cv2.resize(image, dsize = (new_w, maxsize))
    return image
WINDOW_NAME = title                      # main image window
ROI_WINDOW = 'Cut-out area'              # cut-out region window
ROI_POPUP = 'OCR detection result Text'  # OCR result popup window
# ** main function **
def main():
    """Interactive ROI OCR at full resolution: the ROI is selected on the
    resized display image, mapped back to original-image coordinates via
    mylib_frame.ImageFrame, cut out of the full-resolution original and
    sent to Tesseract."""
    # Japanese-capable font for drawing the OCR result
    fontPIL = 'NotoSansCJK-Bold.ttc'
    # Argument parsing and parameter setting
    ARGS = parse_args().parse_args()
    input_stream = ARGS.image
    lang = ARGS.language
    layout = int(ARGS.layout)
    maxsize = int(ARGS.maxsize)
    titleflg = ARGS.title
    # show settings
    display_info(input_stream, lang, layout, maxsize, titleflg)
    # OCR backend (first available pyocr tool, normally Tesseract)
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print(RED + "\nOCR tool Not found." + NOCOLOR)
        quit()
    tool = tools[0]
    # read the image with OpenCV
    lena_frame_org = cv2.imread(input_stream)
    if lena_frame_org is None:
        print(RED + "\nUnable to read the input." + NOCOLOR)
        quit()
    # mylib_frame library: keeps original <-> display coordinate mapping
    imgfr = mylib_frame.ImageFrame(lena_frame_org)
    imgfr.set_screen_size(1680, 1050)
    lena_frame = imgfr.frame_resize(maxsize)
    frame = np.zeros(lena_frame.shape, np.uint8)
    popup_frame = np.zeros((120, 500, 3), np.uint8)
    anchor = cvui.Point()          # drag start point
    roi = cvui.Rect(0, 0, 0, 0)    # current selection (display coordinates)
    working = False                # True while a drag is in progress
    frame_h, frame_w = frame.shape[:2]
    outf = False                   # print the next OCR result once
    org_h, org_w = imgfr.get_original_size()
    scale_h, scale_w = imgfr.get_scale()
    # NOTE(review): the labels say "w x h" but (h, w) pairs are printed
    print('\n original w x h : {:=5} x {:=5}'.format(org_h, org_w))
    print(' display w x h : {:=5} x {:=5}'.format(frame_h, frame_w))
    print(' scale w x h : {:.3f} x {:.3f}'.format(scale_h, scale_w))
    print(' -----------')
    # Init cvui and tell it to create a OpenCV window, i.e. cv.namedWindow(WINDOW_NAME).
    cv2.namedWindow(WINDOW_NAME, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
    cvui.init(WINDOW_NAME)
    while (True):
        # Fill the frame with the display-sized source image
        frame[:] = lena_frame[:]
        cvui.text(frame, 10, 10, 'Click mouse left-button and drag the pointer around to select a cut area.')
        # mouse events
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.DOWN):
            # place the anchor at the mouse pointer
            anchor.x = cvui.mouse().x
            anchor.y = cvui.mouse().y
            working = True
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.IS_DOWN):
            # update the selection rectangle
            width = cvui.mouse().x - anchor.x
            height = cvui.mouse().y - anchor.y
            roi.x = anchor.x + width if width < 0 else anchor.x
            roi.y = anchor.y + height if height < 0 else anchor.y
            roi.width = abs(width)
            roi.height = abs(height)
            # show position and size next to the pointer
            cvui.printf(frame, roi.x + 5, roi.y + 5, 0.3, 0xff0000, '(%d,%d)', roi.x, roi.y)
            cvui.printf(frame, cvui.mouse().x + 5, cvui.mouse().y + 5, 0.3, 0xff0000, 'w:%d, h:%d', roi.width, roi.height)
        if cvui.mouse(cvui.UP):
            # selection finished; print the next OCR result
            working = False
            outf = True
        # clamp the selection to the image bounds
        lenaRows, lenaCols, lenaChannels = lena_frame.shape
        roi.x = 0 if roi.x < 0 else roi.x
        roi.y = 0 if roi.y < 0 else roi.y
        # FIX: numpy arrays have no .cols/.rows attributes — the original
        # 'lena_frame.cols'/'lena_frame.rows' raised AttributeError when
        # the selection ran past the image border. Use the unpacked sizes.
        roi.width = roi.width + lenaCols - (roi.x + roi.width) if roi.x + roi.width > lenaCols else roi.width
        roi.height = roi.height + lenaRows - (roi.y + roi.height) if roi.y + roi.height > lenaRows else roi.height
        # render the selection rectangle
        cvui.rect(frame, roi.x, roi.y, roi.width, roi.height, 0xff0000)
        # draw the title banner
        if (titleflg == 'y'):
            cv2.putText(frame, title, (10, 30), cv2.FONT_HERSHEY_DUPLEX, fontScale=0.8, color=(200, 200, 0), lineType=cv2.LINE_AA)
        # update cvui internal state and show the main window
        cvui.update()
        cv2.imshow(WINDOW_NAME, frame)
        cv2.moveWindow(WINDOW_NAME, 80, 0)
        # map the display ROI back to the original image and cut it out
        if roi.area() > 50 and working == False:
            x0, y0 = imgfr.get_res2org_xy(roi.x, roi.y)
            x1, y1 = imgfr.get_res2org_xy(roi.x + roi.width, roi.y + roi.height)
            lenaRoi = lena_frame_org[y0 : y1, x0 : x1]
            cv2.namedWindow(ROI_WINDOW, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
            cv2.imshow(ROI_WINDOW, lenaRoi)
            cv2.moveWindow(ROI_WINDOW, frame_w + 100, 0)
            lenaRoi_h, lenaRoi_w = lenaRoi.shape[:2]
            # OCR on the cut-out area: convert to a PIL image first
            lenaRoi1 = cv2.cvtColor(lenaRoi, cv2.COLOR_RGB2BGR)
            imgRoi = Image.fromarray(lenaRoi1)
            # txt is a Python string
            txt = tool.image_to_string(imgRoi, lang=lang,
                                       builder=pyocr.builders.TextBuilder(tesseract_layout=layout))
            # draw the recognized text in the popup window
            if len(txt) > 0:
                popup_frame[:,:,:] = 0
                cv2.rectangle(popup_frame, (0, 88), (500, 105), (255,0,0), -1)
                myfunction.cv2_putText(img = popup_frame,
                                       text = txt,
                                       org = (15, 104),
                                       fontFace = fontPIL,
                                       fontScale = 12,
                                       color = (255,255,255),
                                       mode = 0)
                cv2.namedWindow(ROI_POPUP, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
                cv2.imshow(ROI_POPUP, popup_frame)
                cv2.moveWindow(ROI_POPUP, frame_w + 100, lenaRoi_h + 100)
            if outf:
                print(' ', txt)
                print(' <area> ({}, {}) - ({}, {})'.format(x0, y0, x1, y1))
                outf = False
        key = cv2.waitKey(1)
        if key == 27 or key == 113: # 'esc' or 'q'
            break
        if not mylib_gui._is_visible(title): # 'Close' button
            break
    cv2.destroyAllWindows()
    print(' -----------\n Finished.')
# main entry point (execution starts here)
if __name__ == "__main__":
    sys.exit(main())
# -*- coding: utf-8 -*-
##------------------------------------------
## My Library image frame with OpenCV
##
## 2021.12.20 Masahiro Izutsu
##------------------------------------------
## mylib_frame.py
import cv2
import numpy as np
class ImageFrame:
    """Holds an original image plus a resized display copy and converts
    pixel coordinates between the two."""

    max_min = 300        # smallest box size that triggers an actual resize
    frame_org = None     # original image
    frame_res = None     # resized image
    org_h = 0            # original height
    org_w = 0            # original width
    res_h = 0            # resized height
    res_w = 0            # resized width
    scale_h = 0.0        # resize ratio, height
    scale_w = 0.0        # resize ratio, width
    screen_h = 0         # maximum display-window height
    screen_w = 0         # maximum display-window width

    def __init__(self, img, max_min = 300):
        """Remember *img* and its size; *max_min* is the resize threshold."""
        self.frame_org = img
        self.max_min = max_min
        self.org_h, self.org_w = img.shape[:2]

    def set_screen_size(self, height, width):
        """Record the maximum window size used by frame_resize(-1)."""
        self.screen_h = height
        self.screen_w = width

    def frame_resize(self, boxl = 0, zoomf = False):
        """Resize the original image keeping its aspect ratio.

        boxl:  -1 = fit the screen size, 0 = keep size,
               otherwise fit inside a boxl x boxl box (if boxl > max_min)
        zoomf: when True, images smaller than the box are enlarged too
        """
        if self.frame_org is not None:
            limit = min(self.screen_h, self.screen_w) if boxl == -1 else boxl
            if limit > self.max_min:
                landscape = self.org_w > self.org_h
                long_side = self.org_w if landscape else self.org_h
                if long_side > limit or zoomf:
                    # scale the long side to the limit
                    ratio = limit / long_side
                    if landscape:
                        self.res_w = limit
                        self.res_h = round(self.org_h * ratio)
                    else:
                        self.res_h = limit
                        self.res_w = round(self.org_w * ratio)
                else:
                    # smaller than the box and no zoom requested: keep size
                    self.res_h = self.org_h
                    self.res_w = self.org_w
                self.frame_res = cv2.resize(self.frame_org, dsize = (self.res_w, self.res_h))
            else:
                # no resize requested: just copy the original
                self.frame_res = self.frame_org.copy()
                self.res_h, self.res_w = self.frame_res.shape[:2]
            self.scale_h = self.res_h / self.org_h
            self.scale_w = self.res_w / self.org_w
        return self.frame_res

    def get_screen_size(self):
        """Return (screen_h, screen_w)."""
        return self.screen_h, self.screen_w

    def get_scale(self):
        """Return (scale_h, scale_w) — zero until frame_resize() was called."""
        return self.scale_h, self.scale_w

    def get_original_size(self):
        """Return (org_h, org_w)."""
        return self.org_h, self.org_w

    def get_resize_size(self):
        """Return (res_h, res_w)."""
        return self.res_h, self.res_w

    def get_res2org_xy(self, rx, ry):
        """Map resized-image coordinates to original-image coordinates."""
        if self.scale_h <= 0 or self.scale_w <= 0:
            return 0, 0
        return round(rx / self.scale_w), round(ry / self.scale_h)

    def get_org2res_xy(self, ox, oy):
        """Map original-image coordinates to resized-image coordinates."""
        if self.scale_h <= 0 or self.scale_w <= 0:
            return 0, 0
        return round(ox * self.scale_w), round(oy * self.scale_h)
# Manual test routine: resize a blank canvas to fit the screen and print the
# geometry reported by every accessor.  Press 'q' or Esc to quit.
if __name__ == "__main__":
    window_name = 'ImageFrame class'

    # Alternative canvas sizes for trying other aspect ratios:
    # image = np.zeros((2000, 4000, 3), np.uint8)
    # image = np.zeros((2000, 1200, 3), np.uint8)
    # image = np.zeros((1500, 1000, 3), np.uint8)
    # image = np.zeros((1000, 1500, 3), np.uint8)
    image = np.zeros((500, 1000, 3), np.uint8)
    # image = np.zeros((1000, 500, 3), np.uint8)

    imgfr = ImageFrame(image)                   # initialize
    imgfr.set_screen_size(1680, 1050)
    image_res = imgfr.frame_resize(-1, True)    # fit the screen, allow zoom

    cv2.namedWindow(window_name, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
    cv2.imshow(window_name, image_res)

    screen_h, screen_w = imgfr.get_screen_size()
    print('screen_h; ', screen_h, ' screen_w: ', screen_w)
    scale_h, scale_w = imgfr.get_scale()
    print('scale_h; ', scale_h, ' scale_w: ', scale_w)
    org_h, org_w = imgfr.get_original_size()
    print('org_h; ', org_h, ' org_w: ', org_w)
    res_h, res_w = imgfr.get_resize_size()
    print('res_h; ', res_h, ' res_w: ', res_w)
    ox, oy = imgfr.get_res2org_xy(400, 500)
    print('ox: ', ox, ' oy: ', oy)
    rx, ry = imgfr.get_org2res_xy(500, 500)
    print('rx: ', rx, ' ry: ', ry)

    while cv2.waitKey(1) not in (27, 113):      # 'esc' or 'q'
        pass
    cv2.destroyAllWindows()
文字認識エンジンのための画像処理 で作成した処理プログラムをパッケージ化してアプリケーションに組み込みできるようにする。
(py37) $ cd ~/workspace_py37/tryocr (py37) $ python3 mylib_preprocess.py Threshhold=100, MinLineLength=44, MaxLineGap=4, width=527, height=401 Threshhold=100, MinLineLength=58, MaxLineGap=5, width=1060, height=596 -20° 〜 +20° Horizontal Line count= 0, degree= 2.0454084888872277 count= 1, degree= 1.5481576989779677 count= 2, degree= 3.1798301198642345 count= 3, degree= -6.34019174590991 count= 4, degree= 1.5481576989779677 count= 5, degree= 4.763641690726178 count= 6, degree= 11.309932474020213 count= 7, degree= 9.462322208025617 count= 8, degree= -11.309932474020213 count= 9, degree= -11.309932474020213 count= 10, degree= -15.945395900922854 count= 11, degree= 18.43494882292201 count= 12, degree= 10.007979801441337 count= 13, degree= 14.036243467926479 count= 14, degree= 1.7357045889283889 count= 15, degree= -7.125016348901798 count= 16, degree= 8.13010235415598 count= 17, degree= 1.9091524329963763 count= 18, degree= 11.309932474020213 count= 19, degree= 3.3664606634298013 count= 20, degree= 10.304846468766033 count= 21, degree= -8.13010235415598 count= 22, degree= -6.34019174590991 Rotete degree: 2.025741670009789 Input range: min= 320, max= 50 H area: min1= 0, max1= 25, min2= 160, max2= 179 S area: min= 13, max= 255 V area: min= 128, max= 255
# -*- coding: utf-8 -*-
##------------------------------------------
## My Library image preprocessing with OpenCV
##
## 2021.12.20 Masahiro Izutsu
##------------------------------------------
## mylib_preprocess.py
import cv2
import numpy as np
import math
class ImagePreprocess:
    """Image preprocessing utilities for OCR, built on OpenCV.

    Provides binarization, ruled-line removal, tilt correction and stamp
    (seal) removal.  Construct with ``flg=True`` to print diagnostics.
    """

    logf = False  # when True, each step prints diagnostic information

    def __init__(self, flg=False):
        """flg: enable diagnostic logging on stdout."""
        self.logf = flg

    ## Color original -> grayscale -> black & white binary image
    def img_binarization(self, img):
        """Return a 3-channel copy of `img` binarized with Otsu's method."""
        imgw = img.copy()
        # Luminance from the BGR channels (ITU-R BT.601 weights)
        im_gray = 0.299 * img[:, :, 2] + 0.587 * img[:, :, 1] + 0.114 * img[:, :, 0]
        im_gray8 = np.uint8(im_gray)
        # With THRESH_OTSU the thresh/maxval arguments are ignored and the
        # threshold is chosen automatically.
        ret, im_gray8 = cv2.threshold(im_gray8, thresh=0, maxval=255, type=cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        # Replicate the binary image into all three channels
        imgw[:, :, 0] = im_gray8
        imgw[:, :, 1] = im_gray8
        imgw[:, :, 2] = im_gray8
        return imgw

    ## Ruled-line removal
    def delete_line(self, img):
        """Return a copy of `img` with detected straight lines painted white."""
        imgw = img.copy()
        # Grayscale, then Otsu binarization
        gray = cv2.cvtColor(imgw, cv2.COLOR_BGR2GRAY)
        ret, gray = cv2.threshold(gray, thresh=0, maxval=255, type=cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        ## Invert (negative) so lines are white-on-black for Hough detection
        gray = cv2.bitwise_not(gray)
        # Automatic Hough-line parameters derived from the image size
        thr, lln, gap = self.get_autohoughLine(img)
        lines = cv2.HoughLinesP(gray, rho=1, theta=np.pi / 360, threshold=thr, minLineLength=lln, maxLineGap=gap)
        if lines is not None:
            for line in lines:
                x1, y1, x2, y2 = line[0]
                # Erase the line by drawing over it in white
                imgw = cv2.line(imgw, (x1, y1), (x2, y2), (255, 255, 255), 3)
        return imgw

    ## Image rotation (OpenCV)
    def rotate_img(self, img, angle, boder=(255, 255, 255)):
        """Rotate `img` by `angle` degrees around its center.

        `boder` is the fill color for the uncovered margins.
        """
        # Center from the image size (width, height)
        size = tuple([img.shape[1], img.shape[0]])
        center = tuple([size[0] // 2, size[1] // 2])
        # Rotation matrix (center, angle, scale)
        mat = cv2.getRotationMatrix2D(center, angle, scale=1.0)
        # Affine transform (image, matrix, output size, interpolation)
        rot_img = cv2.warpAffine(img, mat, size, flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_CONSTANT, borderValue=boder)
        return rot_img

    # Color detection (HSV)
    #   mode : 0 = keep the specified range, 1 = keep everything outside it,
    #          2 = like 1, then turn the removed (black) pixels white
    #   hmin, hmax : hue range (0 - 360 deg; wrap-around allowed)
    #   smin, smax : saturation range (0 - 100 %)
    #   vmin, vmax : value range (0 - 100 %)
    def detect_color(self, img, hmin, hmax, smin, smax, vmin, vmax, mode):
        """Return (mask, masked_img) for the requested HSV color range."""
        hmin1, hmax1, hmin2, hmax2 = self.calc_hue_area(hmin, hmax)
        # 0-100% -> 0-255
        smin = int(round(smin * 255 / 100))
        smax = int(round(smax * 255 / 100))
        vmin = int(round(vmin * 255 / 100))
        vmax = int(round(vmax * 255 / 100))
        if self.logf:
            print('\n Input range: min= {}, max= {}'.format(hmin, hmax))
            print(' H area: min1= {}, max1= {}, min2= {}, max2= {}'.format(hmin1, hmax1, hmin2, hmax2))
            print(' S area: min= {}, max= {}'.format(smin, smax))
            print(' V area: min= {}, max= {}'.format(vmin, vmax))
        # Convert to HSV color space
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        # HSV range 1
        hsv_min = np.array([hmin1, smin, vmin])
        hsv_max = np.array([hmax1, smax, vmax])
        mask = cv2.inRange(hsv, hsv_min, hsv_max)
        # HSV range 2 (present only when the hue range wraps around 0)
        if hmin2 < 180 and hmax2 < 180:
            hsv_min = np.array([hmin2, smin, vmin])
            hsv_max = np.array([hmax2, smax, vmax])
            mask2 = cv2.inRange(hsv, hsv_min, hsv_max)
            mask = mask + mask2
        # mask     : 255 inside the range, 0 outside
        # mask_inv : inverted mask
        mask_inv = cv2.bitwise_not(mask)
        msk = mask if mode == 0 else mask_inv
        # Apply the mask
        masked_img = cv2.bitwise_and(img, img, mask=msk)
        if mode == 2:
            # Replace the masked-out (black) pixels with white
            before_color = [0, 0, 0]
            after_color = [255, 255, 255]
            masked_img[np.where((masked_img == before_color).all(axis=2))] = after_color
        return mask, masked_img

    ## Execute one preprocessing step
    ## process_mode:
    ##   0 = no operation
    ##   1 = color -> grayscale -> black & white binary image
    ##   2 = ruled-line removal
    ##   3 = tilt correction
    ##   4 = stamp (seal) removal
    def image_processing_execution(self, img, process_mode):
        """Dispatch `img` to the preprocessing step selected by `process_mode`.

        Returns the processed image, or None for an unknown mode.
        """
        img1 = None
        if process_mode == 0:
            # no operation
            img1 = img
        elif process_mode == 1:     # FIX: was a separate `if`; chained for consistency
            img1 = self.img_binarization(img)
        elif process_mode == 2:
            img1 = self.delete_line(img)
        elif process_mode == 3:
            thr, lln, gap = self.get_autohoughLine(img)
            arg = self.get_degree(img, thr, lln, gap)
            img1 = self.rotate_img(img, arg)
            if self.logf:
                print('\n Rotete degree: ', arg)   # message kept as-is (sic) for log compatibility
        elif process_mode == 4:
            # Stamp removal: red hue band (320-50 deg, wrapping), turn it white
            mask, img1 = self.detect_color(img, 320, 50, 5, 100, 50, 100, 2)
        return img1

    ##---------------
    ### Automatic Hough-line parameters scaled to the image size.
    ### Returns: threshold, minLineLength, maxLineGap
    def get_autohoughLine(self, img, logf=True):
        # NOTE: the `logf` parameter is unused (kept for interface
        # compatibility); logging follows self.logf instead.
        h, w = img.shape[:2]
        thr = 100
        lln = max(int(w / 18), 44)      # minimum line length, at least 44 px
        gap = int(w / 1000) + 4
        if self.logf:
            print('\n Threshhold={}, MinLineLength={}, MaxLineGap={}, width={}, height={}'.format(thr, lln, gap, w, h))
        return thr, lln, gap

    ### Distance between two points
    ### Returns: distance
    def get_distance(self, x1, y1, x2, y2):
        return math.hypot(x2 - x1, y2 - y1)

    ### Detect the image tilt angle.
    ### Returns: degree (tilt from horizontal; because of the coordinate
    ### system this is directly the correction angle)
    def get_degree(self, img, thr, lln, gap):
        l_img = img.copy()
        img0 = img.copy()
        gray_image = cv2.cvtColor(l_img, cv2.COLOR_BGR2GRAY)
        edges = cv2.Canny(gray_image, 50, 150, apertureSize=3)
        # FIX: minLineLength/maxLineGap must be keyword arguments.  The 5th
        # positional parameter of HoughLinesP is the output `lines` array, so
        # the original call bound lln and gap to the wrong parameters.
        lines = cv2.HoughLinesP(edges, 1, np.pi / 180, thr, minLineLength=lln, maxLineGap=gap)

        HORIZONTAL = 0
        DIFF = 20  # tolerance: treat -20 .. +20 degrees as a true horizontal line

        # FIX: HoughLinesP returns None when no line is found; the original
        # crashed iterating it.
        if lines is None:
            return HORIZONTAL

        if self.logf:
            print('\n -20° 〜 +20° Horizontal Line')
        sum_arg = 0
        count = 0
        for line in lines:
            for x1, y1, x2, y2 in line:
                # Draw the detected line (debug image, not returned)
                cv2.line(img0, (x1, y1), (x2, y2), (0, 0, 255), 2)
                arg = math.degrees(math.atan2((y2 - y1), (x2 - x1)))
                # arg != 0 excluded so exact-zero detections do not drag the
                # average toward 0
                if arg != 0 and HORIZONTAL - DIFF < arg < HORIZONTAL + DIFF:
                    if self.logf:
                        print(' count= {}, degree= {}'.format(count, arg))
                    sum_arg += arg
                    count += 1
        if count == 0:
            return HORIZONTAL
        return sum_arg / count

    ### Hue-range computation
    ### Input : 0 - 360 (degrees); wrap-around ranges split into two bands
    ### Returns: 0 - 179 OpenCV hue values; min,max=180 means "unused/error"
    def calc_hue_area(self, hmin, hmax):
        h_min1 = 360
        h_max1 = 360
        h_min2 = 360
        h_max2 = 360
        if hmin >= 0 and hmin <= 360 and hmax >= 0 and hmax <= 360:
            if hmin < hmax:
                # Single contiguous band
                h_min1 = hmin
                h_max1 = hmax
            else:
                # Wrap-around: [0, hmax] plus [hmin, 360]
                h_min1 = 0
                h_max1 = hmax
                h_min2 = hmin
                h_max2 = 360
        # Convert to the OpenCV hue range (0-179)
        h_min1 = int(h_min1 / 2)
        h_max1 = int(h_max1 / 2)
        h_min2 = int(h_min2 / 2)
        h_max2 = int(h_max2 / 2)
        if h_max2 > h_min2 and h_max2 == 180:
            h_max2 = 179
        return h_min1, h_max1, h_min2, h_max2
##---------------
# Interactive test driver for ImagePreprocess.
# Keys:
#   1: color -> grayscale -> binary
#   2: ruled-line removal
#   3: tilt correction
#   4: stamp (seal) removal
#   q, Esc: quit
def main(testmode):
    window_name = 'Input Image'
    window_name1 = 'Processing Image'
    testfile = ['None', 'MIF202101-top_s.jpg', 'calendar.png', 'tilt_sample.png', 'stamp2.png']

    imgpros = ImagePreprocess(True)             # initialize with logging on
    img = cv2.imread('images/' + testfile[testmode])
    if img is None:
        print(' Unable to read the input.')
        quit()

    cv2.namedWindow(window_name, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
    cv2.imshow(window_name, img)
    cv2.moveWindow(window_name, 100, 0)

    img1 = imgpros.image_processing_execution(img, testmode)
    if img1 is not None:
        cv2.namedWindow(window_name1, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
        cv2.imshow(window_name1, img1)
        cv2.moveWindow(window_name1, 100 + img.shape[1], 0)

    while True:
        key = cv2.waitKey(1)
        if key in (27, 113):                    # Esc / 'q': return the key code
            break
        if ord('1') <= key <= ord('4'):
            key -= ord('0')                     # digit selects the next test mode
            break

    cv2.destroyAllWindows()
    return key
if __name__ == "__main__":
    # Keep re-running the viewer; main() returns either the next test mode
    # (1-4) or the quit key code (Esc=27 / 'q'=113).
    mode = 1
    while True:
        mode = main(mode)
        if mode in (27, 113):
            break
| キー | 機能 |
| 0 | 前処理設定のクリア(前処理なし) |
| 1 | 白黒2値 |
| 2 | 罫線消去 |
| 3 | 罫線消去,白黒2値 |
| 4 | 印影消去 |
| 5 | 印影消去,白黒2値 |
| 6 | 印影消去,罫線消去 |
| 7 | 印影消去,罫線消去,白黒2値 |
| q,Esc | この画像を終了して別の画像の選択 |
(py37) $ cd ~/workspace_py37/tryocr (py37) $ python3 tryocr_step3.py --- TryOCR Test Program Step-3 --- OpenCV version 4.5.2 TryOCR Test Program Step-3: Starting application... - Image File : images/sample0.png - Language : jpn - Layout : 6 - Program Title: y - Log flag : y file: <images/sample0.png> Screen size: width x height = 2560 x 1335 (pixels) original w x h : 1754 x 1240 display w x h : 1285 x 908 scale w x h : 0.733 x 0.732 ----------- NSホールディングス株式会社 preprocess: 0 area: (72, 188) - (497, 240) 合計金額 \8,500 preprocess: 0 area: (85, 665) - (485, 703) PC4-25600(DDR4-3200) 16GB SO-DIMM preprocess: 0 area: (115, 782) - (594, 823) ----------- Finished.
# -*- coding: utf-8 -*-
##------------------------------------------
## TryOCR Test Programe Step-3
## with tesseract & PyOCR & cvui
##
## 2021.12.19 Masahiro Izutsu
##------------------------------------------
## tryocr_step3.py
## 前処理: '白黒2値', '罫線消去', '印影消去'
# ANSI color escape codes for console output
GREEN = '\033[1;32m'
RED = '\033[1;31m'
NOCOLOR = '\033[0m'
YELLOW = '\033[1;33m'
# Constant definitions: BGR overlay colors and the default input image path
LINE_WORD_BOX_COLOR = (0, 0, 240)
WORD_BOX_COLOR = (255, 0, 0)
CONTENTS_COLOR = (0, 128, 0)
from os.path import expanduser
DEF_INPUT_FILE = expanduser('images/sample0.png')  # default for -i/--image
# import処理
from PIL import Image
import sys
import pyocr
import pyocr.builders
import cv2
import cvui
import argparse
import myfunction
import numpy as np
import mylib_gui
import mylib_frame
import mylib_preprocess
import mylib_screen
from tkinter import filedialog
# Title / version banner printed once at import time
title = 'TryOCR Test Program Step-3'
print(GREEN)
print('--- {} ---'.format(title))
print(' OpenCV version {} '.format(cv2.__version__))
print(NOCOLOR)
# Parses arguments for the application
def parse_args():
    """Return an argparse.ArgumentParser configured with all CLI options."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--image', metavar='IMAGE_FILE', type=str,
                        default=DEF_INPUT_FILE,
                        help="Absolute path to image file. Default value is '" + DEF_INPUT_FILE + "'")
    parser.add_argument('-l', '--language', metavar='LANGUAGE', default='jpn',
                        help="Language. Default value is 'jpn'")
    parser.add_argument('--layout', metavar='LAYOUT', default=6,
                        help='Tesseract layout Default value is 6')
    parser.add_argument('-t', '--title', metavar='TITLE', default='y',
                        help="Program title flag.(y/n) Default value is 'y'")
    parser.add_argument('--log', metavar='LOG', default='y',
                        help="Log flag.(y/n) Default value is 'y'")
    return parser
# Echo the effective run configuration to the console.
def display_info(image, lang, layout, titleflg, logflg):
    """Print the startup banner followed by one line per setting."""
    print(YELLOW + title + ': Starting application...' + NOCOLOR)
    for label, value in (('Image File : ', image),
                         ('Language : ', lang),
                         ('Layout : ', layout),
                         ('Program Title: ', titleflg),
                         ('Log flag : ', logflg)):
        print(' - ' + YELLOW + label + NOCOLOR, value)
# Image annotation
def image_annotation(lena_frame_org, lang='jpn', layout=6, titleflg=False, logflag=False):
    """Interactive OCR window.

    Shows `lena_frame_org` scaled to fit the screen, lets the user drag a
    rectangle with the left mouse button, optionally preprocesses the cut-out
    (keys 0-7 set a bit mask: bit0 = binarize, bit1 = line removal,
    bit2 = stamp removal) and runs tesseract on it, displaying the result.

    Returns the code of the key that ended the loop.
    """
    WINDOW_NAME = title
    ROI_WINDOW = 'Cut-out area'
    ROI_POPUP = 'OCR detection result Text'
    preprocess_mode = 0x0   # bit0: binarize, bit1: line removal, bit2: stamp removal
    wlock1 = 0              # window-placement counters: windows are only moved
    wlock2 = 0              # for their first few frames, then left alone
    wlock3 = 0

    # Japanese-capable font used by myfunction.cv2_putText
    fontPIL = 'NotoSansCJK-Bold.ttc'

    # Display resolution (height, width)
    monitor_height, monitor_width = mylib_screen.get_display_size(logflag)
    maxsize = monitor_height - 50

    # Image preprocessing helper
    imgpros = mylib_preprocess.ImagePreprocess(False)

    # OCR engine
    tools = pyocr.get_available_tools()
    if len(tools) == 0:
        print(RED + "\nOCR tool Not found." + NOCOLOR)
        quit()
    tool = tools[0]

    # mylib_frame library
    imgfr = mylib_frame.ImageFrame(lena_frame_org)
    # FIX: ImageFrame.set_screen_size takes (height, width); the original call
    # passed (width, height) and stored the screen size transposed.
    imgfr.set_screen_size(monitor_height, monitor_width)
    lena_frame = imgfr.frame_resize(maxsize)

    frame = np.zeros(lena_frame.shape, np.uint8)
    popup_frame = np.zeros((120, 500, 3), np.uint8)
    anchor = cvui.Point()
    roi = cvui.Rect(0, 0, 0, 0)
    working = False
    frame_h, frame_w = frame.shape[:2]
    outf = False

    org_h, org_w = imgfr.get_original_size()
    scale_h, scale_w = imgfr.get_scale()
    if logflag:
        # FIX: values are printed width-first to match the 'w x h' labels
        # (the original printed height first under a 'w x h' label).
        print('\n original w x h : {:=5} x {:=5}'.format(org_w, org_h))
        print(' display w x h : {:=5} x {:=5}'.format(frame_w, frame_h))
        print(' scale w x h : {:.3f} x {:.3f}'.format(scale_w, scale_h))
        print(' -----------')

    # Init cvui and tell it to create a OpenCV window, i.e. cv.namedWindow(WINDOW_NAME).
    cv2.namedWindow(WINDOW_NAME, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
    cvui.init(WINDOW_NAME)

    while (True):
        # Fill the frame with the source image
        frame[:] = lena_frame[:]

        # Usage hint
        cvui.text(frame, 10, 10, 'Click mouse left-button and drag the pointer around to select a cut area.')

        # Mouse events
        if cvui.mouse(cvui.LEFT_BUTTON, cvui.DOWN):
            # Place the anchor at the mouse pointer
            anchor.x = cvui.mouse().x
            anchor.y = cvui.mouse().y
            # Selection in progress (result windows are not updated meanwhile)
            working = True

        if cvui.mouse(cvui.LEFT_BUTTON, cvui.IS_DOWN):
            # Update the selection rectangle (normalize for drags in any direction)
            width = cvui.mouse().x - anchor.x
            height = cvui.mouse().y - anchor.y
            roi.x = anchor.x + width if width < 0 else anchor.x
            roi.y = anchor.y + height if height < 0 else anchor.y
            roi.width = abs(width)
            roi.height = abs(height)
            # Show position and size
            cvui.printf(frame, roi.x + 5, roi.y + 5, 0.3, 0xff0000, '(%d,%d)', roi.x, roi.y)
            cvui.printf(frame, cvui.mouse().x + 5, cvui.mouse().y + 5, 0.3, 0xff0000, 'w:%d, h:%d', roi.width, roi.height)

        if cvui.mouse(cvui.UP):
            # Selection finished
            working = False
            outf = True
            wlock1 = 0
            wlock2 = 0
            wlock3 = 0

        # Clamp the rectangle to the image
        lenaRows, lenaCols, lenaChannels = lena_frame.shape
        roi.x = 0 if roi.x < 0 else roi.x
        roi.y = 0 if roi.y < 0 else roi.y
        # FIX: the original used lena_frame.cols / lena_frame.rows, which do
        # not exist on numpy arrays (AttributeError as soon as the selection
        # ran past the image edge).  Clamp with the shape values instead.
        if roi.x + roi.width > lenaCols:
            roi.width = lenaCols - roi.x
        if roi.y + roi.height > lenaRows:
            roi.height = lenaRows - roi.y

        # Render the selection rectangle
        cvui.rect(frame, roi.x, roi.y, roi.width, roi.height, 0xff0000)

        # Draw the title
        if (titleflg == 'y'):
            cv2.putText(frame, title, (10, 30), cv2.FONT_HERSHEY_DUPLEX, fontScale=0.8, color=(200, 200, 0), lineType=cv2.LINE_AA)

        # Update and show the main window
        cvui.update()
        cv2.imshow(WINDOW_NAME, frame)
        if wlock1 < 10:
            cv2.moveWindow(WINDOW_NAME, 80, 0)
            wlock1 = wlock1 + 1
        else:
            wlock1 = 10

        # Map the display coordinates back to the original image and cut it out
        if roi.area() > 50 and working == False:
            x0, y0 = imgfr.get_res2org_xy(roi.x, roi.y)
            x1, y1 = imgfr.get_res2org_xy(roi.x + roi.width, roi.y + roi.height)
            lenaRoi = lena_frame_org[y0 : y1, x0 : x1]

            # Preprocessing (order: stamp removal -> line removal -> binarize)
            prs_color = [(0, 0, 0), (0, 0, 0), (0, 0, 0)]
            if preprocess_mode & 0x4 != 0:
                lenaRoi = imgpros.image_processing_execution(lenaRoi, 4)
                prs_color[2] = (0, 0, 255)
            if preprocess_mode & 0x2 != 0:
                lenaRoi = imgpros.image_processing_execution(lenaRoi, 2)
                prs_color[1] = (0, 0, 255)
            if preprocess_mode & 0x1 != 0:
                lenaRoi = imgpros.image_processing_execution(lenaRoi, 1)
                prs_color[0] = (0, 0, 255)

            # Show the OCR input image with a 30 px gray caption strip on top
            lenaRoi_h, lenaRoi_w = lenaRoi.shape[:2]
            img_Roi = np.zeros((lenaRoi_h + 30, lenaRoi_w, 3), np.uint8)
            img_Roi[:, :, :] = 200
            img_Roi[30:lenaRoi_h + 30, :] = lenaRoi

            ## Preprocess-mode caption (two font sizes; omitted when the cut is too narrow)
            if lenaRoi_w > 320:
                fs = 16
                xs = 80
            else:
                fs = 10
                xs = 50
            if xs * 3 < lenaRoi_w:
                myfunction.cv2_putText(img_Roi, '前処理:', (10, 4), fontPIL, fs, (100, 100, 100), 1)
                myfunction.cv2_putText(img_Roi, '白黒2値', (10 + xs, 4), fontPIL, fs, prs_color[0], 1)
                myfunction.cv2_putText(img_Roi, '罫線消去', (10 + xs * 2, 4), fontPIL, fs, prs_color[1], 1)
                myfunction.cv2_putText(img_Roi, '印影消去', (10 + xs * 3, 4), fontPIL, fs, prs_color[2], 1)
            ##
            cv2.namedWindow(ROI_WINDOW, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
            cv2.imshow(ROI_WINDOW, img_Roi)
            if wlock2 < 10:
                cv2.moveWindow(ROI_WINDOW, frame_w + 100, 0)
                wlock2 = wlock2 + 1
            else:
                wlock2 = 10

            # OCR the cut-out region: convert channel order and wrap in a PIL image
            lenaRoi1 = cv2.cvtColor(lenaRoi, cv2.COLOR_RGB2BGR)
            imgRoi = Image.fromarray(lenaRoi1)
            # txt is a Python string
            txt = tool.image_to_string(imgRoi, lang=lang,
                                       builder=pyocr.builders.TextBuilder(tesseract_layout=layout))

            # Draw the recognized text
            if len(txt) > 0:
                popup_frame[:, :, :] = 0
                cv2.rectangle(popup_frame, (0, 88), (500, 105), (255, 0, 0), -1)
                myfunction.cv2_putText(img=popup_frame,
                                       text=txt,
                                       org=(15, 104),
                                       fontFace=fontPIL,
                                       fontScale=12,
                                       color=(255, 255, 255),
                                       mode=0)
                cv2.namedWindow(ROI_POPUP, flags=cv2.WINDOW_AUTOSIZE | cv2.WINDOW_GUI_NORMAL)
                cv2.imshow(ROI_POPUP, popup_frame)
                if wlock3 < 10:
                    cv2.moveWindow(ROI_POPUP, frame_w + 100, lenaRoi_h + 100)
                    wlock3 = wlock3 + 1
                else:
                    wlock3 = 10

                if outf and logflag:
                    print(' ', txt)
                    print(' preprocess: {} area: ({}, {}) - ({}, {})'.format(preprocess_mode, x0, y0, x1, y1))
                    outf = False

        key = cv2.waitKey(1)
        if key == 27 or key == 113:                 # 'esc' or 'q'
            break
        elif key >= ord('0') and key <= ord('7'):   # change preprocess mode
            preprocess_mode = key - ord('0')
            outf = True
        if not mylib_gui._is_visible(title):        # 'Close' button
            break

    cv2.destroyAllWindows()
    if logflag:
        print(' -----------\n')
    return key
# ** main function **
def main():
    """Parse the command line, then loop: annotate an image, ask for the next one.

    The loop ends when the file-open dialog is cancelled.
    """
    # Argument parsing and parameter setting
    ARGS = parse_args().parse_args()
    filename = ARGS.image
    lang = ARGS.language
    layout = int(ARGS.layout)
    titleflg = ARGS.title
    logflg = ARGS.log
    logflag = logflg == 'y'

    # Show the effective configuration
    display_info(filename, lang, layout, titleflg, logflg)

    while True:
        if logflag:
            print('\n file: <{}>'.format(filename))

        # Read the image with OpenCV
        frame = cv2.imread(filename)
        if frame is None:
            print(RED + "\nUnable to read the input." + NOCOLOR)
            quit()

        # Annotate / OCR the image (its return value is intentionally unused:
        # the application quits when the file dialog below is cancelled)
        ret = image_annotation(frame, lang, layout, titleflg, logflag)

        # Ask for the next image file
        filename = filedialog.askopenfilename(
            title="画像ファイルを開く",
            filetypes=[("Image file", ".bmp .png .jpg .tif"),
                       ("Bitmap", ".bmp"),
                       ("PNG", ".png"),
                       ("JPEG", ".jpg")],
            initialdir="./")          # start in this program's directory
        if not filename:
            break

    if logflag:
        print('\n Finished.')
# Script entry point: run main() and use its result as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
→ 以降「OCR アプリケーション基礎編 2」へ続く
PukiWiki 1.5.4 © 2001-2022 PukiWiki Development Team. Powered by PHP 7.4.33. HTML convert time: 0.054 sec.










