# -*- coding: utf-8 -*-
##------------------------------------------
## One Shot Talking Face (GUI) Ver 0.01
##
## 2024.10.16 Masahiro Izutsu
##------------------------------------------
## talk_face.py
import warnings
warnings.simplefilter('ignore')
# ANSI color escape codes ---------------------------
GREEN = '\033[1;32m'
RED = '\033[1;31m'
NOCOLOR = '\033[0m'
YELLOW = '\033[1;33m'
CYAN = '\033[1;36m'
BLUE = '\033[1;34m'
# Imports and initial setup
import os
import argparse
import subprocess
import platform
import cv2
import PySimpleGUI as sg
import tkinter as tk
from PIL import Image, ImageTk
import my_logging
import my_movieplay
import my_thumbnail
from torch.cuda import is_available
gpu_d = is_available()  # check whether a CUDA GPU is available
# Constant definitions
DEF_AUDIO = './select/audios/obama2.wav'
DEF_IMAGE = './select/images/d5.jpg'
RESULT_PATH = './results'
RESULT_PHONE = './results/phone'
DEF_THEME = 'BlueMono'
KEY_IMGFILE = '-ImgFile-'
KEY_WAVFILE = '-WavFile-'
KEY_TALKFILE = '-TalkFile-'
KEY_TXTIMG = '-TxtImg-'
KEY_TALKFILE2 = '-TalkFile2-'
KEY_VIDEO = '-Video-'
KEY_EXIT = '-Exit-'
KEY_PAGE = '-Page-'
KEY_PAGEUP = '-PageUp-'
KEY_PAGEDOWN = '-PageDown-'
THUMB_SIZE = 64
GAP_SIZE = 4
CANVAS_XN = 8
CANVAS_YN = 5
# Title
title = 'One Shot Talking Face (GUI) Ver. 0.01'
sub_title = ''
# Parses arguments for the application
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--audio_file", default=DEF_AUDIO, help="path to audio file")
parser.add_argument("--source_image", default=DEF_IMAGE, help="path to source image")
parser.add_argument("--result_path", default=RESULT_PATH, help="path to output")
parser.add_argument("--cpu", dest="cpu", action="store_true", help="cpu mode.")
parser.add_argument('--log', metavar = 'LOG', default = '3', help = 'Log level(-1/0/1/2/3/4/5) Default value is \'3\'')
return parser
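# Usage sketch (hypothetical invocation, assuming the bundled sample files exist):
#   python talk_face.py --audio_file ./select/audios/obama2.wav \
#       --source_image ./select/images/d5.jpg --cpu --log 3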
# Display basic information
def display_info(args, title):
print('\n' + GREEN + title + ': Starting application...' + NOCOLOR)
print(' - ' + YELLOW + 'audio_file : ' + NOCOLOR, args.audio_file)
    print(' - ' + YELLOW + 'source_image: ' + NOCOLOR, args.source_image)
print(' - ' + YELLOW + 'result_path : ' + NOCOLOR, args.result_path)
print(' - ' + YELLOW + 'cpu : ' + NOCOLOR, args.cpu)
print(' - ' + YELLOW + 'log : ' + NOCOLOR, args.log)
print(' ')
# Get the talking-video file name
def get_talk_file(out_path, image_file, wave_file):
base_dir_pair = os.path.split(image_file)
i_name, _ = os.path.splitext(base_dir_pair[1])
base_dir_pair = os.path.split(wave_file)
a_name, _ = os.path.splitext(base_dir_pair[1])
path = out_path + '/' + i_name + '_' + a_name + '.mp4'
flg = os.path.exists(path)
return path, flg
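# Example (hypothetical paths): the video name joins both base names, so
#   get_talk_file('./results', './select/images/d5.jpg', './select/audios/obama2.wav')
# returns ('./results/d5_obama2.mp4', <True if that file already exists>).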
# ** Create a phoneme JSON file from a WAV file **
def wav2json(wave_file, logger):
base_dir_pair = os.path.split(wave_file)
s_name, _ = os.path.splitext(base_dir_pair[1])
path = RESULT_PHONE + '/' + s_name + '.json'
if not os.path.exists(path):
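        # jq filter sketch: upper-case each recognized word, map pocketsphinx
        # silence/noise tokens (<s>, <sil>, [SPEECH], [NOISE], +SPN+, +NSN+) to
        # "sil"/"SIL", drop alternate-pronunciation suffixes such as "(2)", and
        # convert begin/duration times into integer 10 ms frame indices.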
jq_param = '[.w[]|{word: (.t | ascii_upcase | sub("<S>"; "sil") | sub("<SIL>"; "sil") | sub("\\\(2\\\)"; "") | sub("\\\(3\\\)"; "") | sub("\\\(4\\\)"; "") | sub("\\\[SPEECH\\\]"; "SIL") | sub("\\\[NOISE\\\]"; "SIL")), phones: [.w[]|{ph: .t | sub("\\\+SPN\\\+"; "SIL") | sub("\\\+NSN\\\+"; "SIL"), bg: (.b*100)|floor, ed: (.b*100+.d*100)|floor}]}]'
command = f"pocketsphinx -phone_align yes single {wave_file} $text | jq '{jq_param}' > {path}"
logger.debug(command)
os.system(command)
        logger.info(f'saved json to: {path}')
return path
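# Example (hypothetical input): wav2json('./select/audios/obama2.wav', logger)
# runs pocketsphinx phone alignment, filters it through jq, and returns
# './results/phone/obama2.json', reusing the file when it already exists.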
# ** Create a talking video from a still image **
def image2talk(image_file, wave_file, phone_path, out_path, logger):
import config
from test_script2 import test_with_input_audio_and_image2
base_dir_pair = os.path.split(image_file)
i_name, _ = os.path.splitext(base_dir_pair[1])
base_dir_pair = os.path.split(wave_file)
a_name, _ = os.path.splitext(base_dir_pair[1])
path = out_path + '/' + i_name + '_' + a_name + '.mp4'
if not os.path.exists(path):
test_with_input_audio_and_image2(image_file, wave_file, phone_path, config.GENERATOR_CKPT, config.AUDIO2POSE_CKPT, out_path, False)
return path
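# Example (hypothetical paths): image2talk('./select/images/d5.jpg',
# './select/audios/obama2.wav', './results/phone/obama2.json', './results', logger)
# returns './results/d5_obama2.mp4', generating it only if it does not exist yet.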
# Get the file name from a file path
def path2filename(path):
base_dir_pair = os.path.split(path)
filename = base_dir_pair[1]
return filename
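# Example: path2filename('./select/images/d5.jpg') -> 'd5.jpg'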
# Main process
def main_process(opt, logger):
title_a = f'One Shot Talking Face {sub_title}'
    # Create the Thumbnail object
Thumb = my_thumbnail.Thumbnail(CANVAS_XN, CANVAS_YN, THUMB_SIZE, GAP_SIZE, ['.jpg', '.png', '.bmp'])
image_file = opt.source_image
frame = Thumb.initialize(image_file)
image_file = Thumb.get_sel_file()
result_path = opt.result_path
audio_file = opt.audio_file
audio_files, audio_dir, sel_audio = Thumb.get_file_list(audio_file, ['.wav'])
    logger.debug(f'audio_files = {audio_files}')
    # Window theme
sg.theme(DEF_THEME)
canvas = sg.Image(size = Thumb.get_canvas_size(), key='CANVAS')
    # Window layout
col_left = [
[sg.Text("Audio File select:", size=(20, 1))],
[sg.Listbox(audio_files, key='-AudioList-', size = (20, 10), default_values = sel_audio, enable_events=True)],
[sg.Text("audio file:", size=(20, 1))],
[sg.Text(path2filename(audio_file), background_color='White', size=(20, 1), key = KEY_WAVFILE)],
[sg.Text("image file:", size=(20, 1))],
[sg.Text(path2filename(image_file), background_color='White', size=(20, 1), key = KEY_IMGFILE)],
[sg.Text("Talking Video:", size=(20, 1))],
[sg.Text("", background_color='White', size=(20, 1), key = KEY_TALKFILE)],
]
col_right = [
[sg.Text("Image file select:", size=(12, 1)), sg.Text(image_file, background_color='LightSteelBlue1', size=(40, 1), key = KEY_TXTIMG)],
[canvas],
]
col_btn = [
[
sg.Text('', background_color='LightSteelBlue1', size=(28, 1), key = KEY_TALKFILE2),
sg.Text("", size=(4, 1)),
sg.Text("Page: 1/1", size=(12, 1), key=KEY_PAGE),
sg.Button('▼', size=(2, 1), key=KEY_PAGEUP),
sg.Button('▲', size=(2, 1), key=KEY_PAGEDOWN),
sg.Text("", size=(4, 1)),
sg.Button('Video', size=(8, 1), key=KEY_VIDEO),
sg.Button('Exit', size=(8, 1), key=KEY_EXIT),
sg.Text("", size=(1, 1))
]
]
layout = [[sg.Text("", size=(1, 1)), sg.Text(title_a, size=(34, 1), justification='left', font='Helvetica 16')],
[sg.Column(col_left, vertical_alignment='top'), sg.Column(col_right, vertical_alignment='top')],
[sg.Column(col_btn, justification='r') ],
]
    # Create the window object
window = sg.Window(title, layout, finalize=True, return_keyboard_events=True, use_default_focus=False)
img = cv2.imencode('.png', frame)[1].tobytes()
window['CANVAS'].update(img)
    # Bind user events on the canvas
canvas.bind('<Motion>', '_motion')
canvas.bind('<ButtonPress>', '_click_on')
canvas.bind('<ButtonRelease>', '_click_off')
canvas.bind('<Double-Button>', '_double_click')
page_offset, page_max = Thumb.get_page_max()
window[KEY_PAGE].update(f'Page: {page_offset + 1}/{page_max}')
window[KEY_PAGEUP].update(disabled = not Thumb.check_page_up())
window[KEY_PAGEDOWN].update(disabled = not Thumb.check_page_down())
    new_make_f = False
    video_play_f = False
    video_f = False    # whether a talking video already exists for the current selection
    video_file = ''
    sel_video = ''
    # Event loop
while True:
event, values = window.read(timeout=30)
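        # The 30 ms read timeout keeps the GUI responsive; heavy work (video
        # generation and playback) is deferred to the next loop pass through
        # the new_make_f / video_play_f flags set by the event handlers below.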
        if new_make_f:
            logger.info('New Talking Video making...')
            phone_file = wav2json(audio_file, logger)
            logger.debug(f'phone_file → {phone_file}')
            video_file = image2talk(image_file, audio_file, phone_file, result_path, logger)
            sel_video = video_file  # remember the new video so the playback step below can find it
            logger.info(f'New Talking Video → {sel_video}')
            video_play_f = True
if video_play_f:
my_movieplay.movie_play(sel_video, sel_video)
window[KEY_EXIT].update(disabled = False)
video_play_f = False
new_make_f = False
if event == KEY_EXIT or event == sg.WIN_CLOSED:
break
if event == KEY_PAGEUP:
logger.debug(f'{event}')
frame = Thumb.page_up()
if frame is not None:
window[KEY_TXTIMG].update('')
img = cv2.imencode('.png', frame)[1].tobytes()
window['CANVAS'].update(img)
page_offset, page_max = Thumb.get_page_max()
window[KEY_PAGE].update(f'Page: {page_offset + 1}/{page_max}')
window[KEY_PAGEUP].update(disabled = not Thumb.check_page_up())
window[KEY_PAGEDOWN].update(disabled = not Thumb.check_page_down())
if event == KEY_PAGEDOWN:
logger.debug(f'{event}')
frame = Thumb.page_down()
if frame is not None:
window[KEY_TXTIMG].update('')
img = cv2.imencode('.png', frame)[1].tobytes()
window['CANVAS'].update(img)
page_offset, page_max = Thumb.get_page_max()
window[KEY_PAGE].update(f'Page: {page_offset + 1}/{page_max}')
window[KEY_PAGEUP].update(disabled = not Thumb.check_page_up())
window[KEY_PAGEDOWN].update(disabled = not Thumb.check_page_down())
if event == 'CANVAS_motion':
x = canvas.user_bind_event.x
y = canvas.user_bind_event.y
filename, frame = Thumb.pixel2file(x, y)
image_file0 = Thumb.filename2path(filename)
window[KEY_TXTIMG].update(filename)
img = cv2.imencode('.png', frame)[1].tobytes()
window['CANVAS'].update(img)
video_file, video_f = get_talk_file(result_path, image_file0, audio_file)
window[KEY_VIDEO].update(disabled = sel_video == '' and not gpu_d)
if video_f:
base_dir_pair = os.path.split(video_file)
name = base_dir_pair[1]
else:
name = ''
video_file = ''
window[KEY_TALKFILE2].update(name)
if event == 'CANVAS_click_on':
image_file, frame = Thumb.select_file()
window[KEY_IMGFILE].update(path2filename(image_file))
img = cv2.imencode('.png', frame)[1].tobytes()
window['CANVAS'].update(img)
logger.debug(f'{event} {image_file}')
if video_f:
base_dir_pair = os.path.split(video_file)
name = base_dir_pair[1]
else:
name = ''
video_file = ''
sel_video = video_file
window[KEY_TALKFILE].update(name)
window[KEY_VIDEO].update(disabled = sel_video == '' and not gpu_d)
if event == KEY_VIDEO:
if os.path.exists(sel_video):
video_play_f = True
logger.debug(video_file)
window[KEY_VIDEO].update(disabled = True)
window[KEY_EXIT].update(disabled = True)
elif gpu_d:
window[KEY_EXIT].update(disabled = True)
window[KEY_TALKFILE].update('Talk Video making...')
new_make_f = True
if event == 'CANVAS_double_click':
if os.path.exists(sel_video):
video_play_f = True
logger.debug(video_file)
window[KEY_VIDEO].update(disabled = True)
window[KEY_EXIT].update(disabled = True)
elif gpu_d:
window[KEY_EXIT].update(disabled = True)
window[KEY_TALKFILE].update('Talk Video making...')
new_make_f = True
if event == 'CANVAS_click_off':
pass
if event == '-AudioList-':
name = values['-AudioList-'][0]
window[KEY_WAVFILE].update(name)
audio_file = os.path.join(audio_dir, name)
video_file, video_f = get_talk_file(result_path, image_file, audio_file)
if video_f:
base_dir_pair = os.path.split(video_file)
name = base_dir_pair[1]
else:
name = ''
video_file = ''
sel_video = video_file
window[KEY_TALKFILE].update(name)
window[KEY_TALKFILE2].update('')
window[KEY_VIDEO].update(disabled = True)
window[KEY_EXIT].update(disabled = True)
            cmd = "python wav_play.py " + audio_file
            if platform.system() == 'Windows':
                pro = subprocess.Popen(cmd)
                sg.popup_no_buttons(audio_file, background_color='#ffffff', auto_close=True, auto_close_duration=5, no_titlebar=True)
                pro.terminate()
            else:
                pro = subprocess.Popen('exec ' + cmd, shell=True)
                sg.popup_no_buttons(audio_file, background_color='#ffffff', auto_close=True, auto_close_duration=5, no_titlebar=True)
                pro.kill()
window[KEY_VIDEO].update(disabled = not video_f and not gpu_d)
window[KEY_EXIT].update(disabled = False)
logger.debug(audio_file)
    # Window shutdown
window.close()
# Entry point (execution starts here)
if __name__ == "__main__":
parser = parse_args()
opt = parser.parse_args()
    # Application log settings
module = os.path.basename(__file__)
module_name = os.path.splitext(module)[0]
logger = my_logging.get_module_logger_sel(module_name, int(opt.log))
if opt.cpu or platform.system()=='Windows':
gpu_d = False
sub_title = '' if gpu_d else '<view mode>'
display_info(opt, title)
main_process(opt, logger)
    msg = 'Processing results: ' + os.getcwd() + opt.result_path[1:]
my_thumbnail.file_dialog(file_path=opt.result_path, title=msg, theme=DEF_THEME, xn=10, yn=4, thumb_size=128, gap=4, ret='Exit', audio_f=True, logger=logger)
logger.info('\nFinished.\n')
# -*- coding: utf-8 -*-
##------------------------------------------
## Wave File Audio Player Ver 0.01
##
## 2024.10.22 Masahiro Izutsu
##------------------------------------------
## wav_play.py
import sys
import wave
import pyaudio
args = sys.argv
wf = wave.open(args[1], "rb")
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
output=True)
# Write to the stream chunk by chunk to play the audio
chunk = 1024
data = wf.readframes(chunk)
while data:
    stream.write(data)
    data = wf.readframes(chunk)
stream.stop_stream()
stream.close()
wf.close()
p.terminate()
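# Usage (hypothetical example, assuming an uncompressed PCM WAV file):
#   python wav_play.py ./select/audios/obama2.wav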