:Event: PyCon APAC 2022
:Presented: 2022/07/20 (pre-recorded) nikkie
Many thanks to all the staff who worked so hard❤️
Shion listens to what is said and reads the recognized text back aloud.
Technologies behind Shion v0.0.1
A human provides voice input (speaks to Shion).
The say command in macOS
say -v <voice> <text>
!say -v Kyoko いま、幸せ?
say -v ? : obtain a list of voices
!say -v Mei-Jia 你好
The say command from Python
subprocess in the standard library
subprocess.run
Example
subprocess.run(["ls", "-l"]) # Call `ls -l`
subprocess.run
import subprocess
subprocess.run(["say", "-v", "Kyoko", "いま、幸せ?"])
CompletedProcess(args=['say', '-v', 'Kyoko', 'いま、幸せ?'], returncode=0)
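As a side note, subprocess.run can also capture a command's output. A minimal sketch (macOS only, and not part of the original demo) that grabs the voice list printed by say -v ? from Python:

import subprocess

# Capture the voice list printed by `say -v ?` (macOS only; illustrative sketch)
result = subprocess.run(["say", "-v", "?"], capture_output=True, text=True)
print(result.stdout.splitlines()[:3])  # first few available voices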
import readline  # noqa: F401
import subprocess


def say(sentence: str):
    subprocess.run(["say", "-v", "Kyoko", sentence])


if __name__ == "__main__":
    while True:
        # Prompt: "Enter a sentence to read aloud (q to quit)"
        sentence = input("読み上げたい文を入力してください (qで終了): ")
        stripped = sentence.strip()
        if not stripped:
            continue
        if stripped.lower() == "q":
            break
        say(stripped)
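Since say exists only on macOS, one might want the script to fail loudly elsewhere. One option (my addition, not part of the talk) is subprocess.run's check=True:

import subprocess

# Raises subprocess.CalledProcessError if `say` exits with a non-zero status
subprocess.run(["say", "-v", "Kyoko", "いま、幸せ?"], check=True)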
SpeechRecognition
import speech_recognition as sr
r = sr.Recognizer()
with sr.Microphone(sample_rate=16_000) as source:
    print("なにか話してください")  # "Please say something"
    audio = r.listen(source)
print("音声を取得しました")  # "Audio captured"
なにか話してください
音声を取得しました
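If the microphone picks up a lot of background noise, SpeechRecognition can calibrate its energy threshold before recording. A small sketch (not part of the original demo) using adjust_for_ambient_noise:

r = sr.Recognizer()
with sr.Microphone(sample_rate=16_000) as source:
    # Listen for about a second to calibrate for ambient noise first
    r.adjust_for_ambient_noise(source, duration=1)
    print("なにか話してください")  # "Please say something"
    audio = r.listen(source)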
Select Google Cloud Speech-to-Text API
import os

# Path to the Google Cloud Speech-to-Text service account key (JSON)
with open(os.environ.get("SPEECH_TO_TEXT_API_SERVICE_ACCOUNT_KEY")) as f:
    credentials = f.read()

recognized_text = r.recognize_google_cloud(
    audio, credentials, language="ja-JP"
)
print(recognized_text.strip())
こんにちは
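recognize_google_cloud can fail, for example when the speech is unintelligible or the API call errors out. A hedged sketch of handling that, reusing the variables from the cell above (the exception classes are from the SpeechRecognition library):

try:
    recognized_text = r.recognize_google_cloud(
        audio, credentials, language="ja-JP"
    )
    print(recognized_text.strip())
except sr.UnknownValueError:
    print("音声を認識できませんでした")  # "Could not recognize the speech"
except sr.RequestError as e:
    print(f"APIリクエストに失敗しました: {e}")  # "API request failed"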
import argparse

import speech_recognition as sr


def input_from_microphone(recognizer: "sr.Recognizer") -> "sr.AudioData":
    with sr.Microphone(sample_rate=16_000) as source:
        print("なにか話してください")  # "Please say something"
        audio = recognizer.listen(source)
    print("音声を取得しました")  # "Audio captured"
    return audio


def recognize_speech(
    recognizer: "sr.Recognizer", audio: "sr.AudioData", credentials: str
) -> str:
    recognized_text = recognizer.recognize_google_cloud(
        audio, credentials, language="ja-JP"
    )
    return recognized_text.strip()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("credentials_path")
    args = parser.parse_args()
    with open(args.credentials_path) as f:
        credentials = f.read()

    r = sr.Recognizer()
    while True:
        audio = input_from_microphone(r)
        text = recognize_speech(r, audio, credentials)
        print(text)
        # Prompt: "Press q to quit, or Enter to continue"
        character = input("ここで終了する場合はq、続ける場合はEnterを押してください: ")
        if character.strip().lower() == "q":
            break
def talk_with_chatbot(sentence: str) -> str:
    return sentence
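talk_with_chatbot is a placeholder that simply echoes the input. Before plugging in a real chatbot, a slightly richer version could look like this hypothetical rule-based sketch (the canned reply is my own example):

def talk_with_chatbot(sentence: str) -> str:
    # Hypothetical canned replies; fall back to echoing the input as before
    canned_replies = {"こんにちは": "こんにちは!"}  # "hello" -> "hello!"
    return canned_replies.get(sentence, sentence)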
Running the say command with subprocess.run depends on the OS (say is macOS-only)
ttslearn
from ttslearn.dnntts import DNNTTS
dnntts_engine = DNNTTS()
audio_array, sampling_rate = dnntts_engine.tts("いま、幸せ?")
DNNTTS()
The tts method returns a NumPy array representing the audio data
sounddevice
Play and Record Sound with Python
import sounddevice as sd
sd.play(audio_array, sampling_rate)
sd.wait()
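If you want to keep the synthesized audio instead of only playing it, the soundfile library used later in this talk can write the NumPy array to a WAV file. A small sketch reusing audio_array and sampling_rate from above (the file name is mine):

import soundfile as sf

# Write the synthesized waveform to disk; the format is inferred from the extension
sf.write("synthesized.wav", audio_array, sampling_rate)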
import readline  # noqa: F401

import sounddevice as sd
from ttslearn.dnntts import DNNTTS

dnntts_engine = DNNTTS()


def say(sentence: str):
    audio_array, sampling_rate = dnntts_engine.tts(sentence)
    sd.play(audio_array, sampling_rate)
    sd.wait()


if __name__ == "__main__":
    while True:
        # Prompt: "Enter a sentence to read aloud (q to quit)"
        sentence = input("読み上げたい文を入力してください (qで終了): ")
        stripped = sentence.strip()
        if not stripped:
            continue
        if stripped.lower() == "q":
            break
        say(stripped)
ESPnet
end-to-end speech processing toolkit
Use the model published on Hugging Face
pip install espnet-model-zoo
from espnet2.bin.asr_inference import Speech2Text
speech2text = Speech2Text.from_pretrained(
"kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"
)
SoundFile
an audio library based on libsndfile, CFFI and NumPy.
Create sample audio with the say command
!say -v Kyoko いま、幸せ? -o sample.wav --data-format=LEF32@16000
@16000 means the sampling rate (ref: man say)
import soundfile as sf
speech_array, sampling_rate = sf.read("sample.wav")
nbests = speech2text(speech_array)
text, tokens, *_ = nbests[0]
print(text)
今幸せ
SpeechRecognition again
r = sr.Recognizer()
with sr.Microphone(sample_rate=16_000) as source:
    print("なにか話してください")
    audio = r.listen(source)
print("音声を取得しました")
なにか話してください
音声を取得しました
wav_bytes = audio.get_wav_data()
type(wav_bytes)
bytes
from io import BytesIO
wav_stream = BytesIO(wav_bytes)
speech_array, sampling_rate = sf.read(wav_stream)
type(speech_array)
numpy.ndarray
nbests = speech2text(speech_array)
text, tokens, *_ = nbests[0]
print(text)
えー今幸せ
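For reference, the same conversion can be done without the WAV round trip, assuming the microphone delivers 16-bit samples (the SpeechRecognition default). A hedged sketch reusing the audio object from above:

import numpy as np

raw_bytes = audio.get_raw_data()  # 16-bit PCM samples by default
int16_array = np.frombuffer(raw_bytes, dtype=np.int16)
# Normalize to the [-1.0, 1.0] float range that sf.read also produces
speech_array = int16_array.astype(np.float64) / 32768.0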
from io import BytesIO

import numpy as np
import soundfile as sf
import speech_recognition as sr
from espnet2.bin.asr_inference import Speech2Text

speech2text = Speech2Text.from_pretrained(
    "kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"
)

SAMPLING_RATE_HZ = 16_000


def input_from_microphone(recognizer: "sr.Recognizer") -> "sr.AudioData":
    with sr.Microphone(sample_rate=SAMPLING_RATE_HZ) as source:
        print("なにか話してください")  # "Please say something"
        audio = recognizer.listen(source)
    print("音声を取得しました")  # "Audio captured"
    return audio


def convert_to_array(audio: "sr.AudioData") -> "np.ndarray":
    wav_bytes = audio.get_wav_data()
    wav_stream = BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    assert sampling_rate == SAMPLING_RATE_HZ
    return audio_array


def recognize_speech(audio_array: "np.ndarray") -> str:
    nbests = speech2text(audio_array)
    text, tokens, *_ = nbests[0]
    return text


if __name__ == "__main__":
    r = sr.Recognizer()
    while True:
        audio = input_from_microphone(r)
        array = convert_to_array(audio)
        text = recognize_speech(array)
        print(text)
        # Prompt: "Press q to quit, or Enter to continue"
        character = input("ここで終了する場合はq、続ける場合はEnterを押してください: ")
        if character.strip().lower() == "q":
            break
from io import BytesIO

import numpy as np
import sounddevice as sd
import soundfile as sf
import speech_recognition as sr
from espnet2.bin.asr_inference import Speech2Text
from ttslearn.dnntts import DNNTTS

speech2text = Speech2Text.from_pretrained(
    "kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"
)

SAMPLING_RATE_HZ = 16_000


def input_from_microphone(recognizer: "sr.Recognizer") -> "sr.AudioData":
    with sr.Microphone(sample_rate=SAMPLING_RATE_HZ) as source:
        print("なにか話してください")  # "Please say something"
        audio = recognizer.listen(source)
    print("音声を取得しました")  # "Audio captured"
    return audio


def convert_to_array(audio: "sr.AudioData") -> "np.ndarray":
    wav_bytes = audio.get_wav_data()
    wav_stream = BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    assert sampling_rate == SAMPLING_RATE_HZ
    return audio_array


def recognize_speech(audio_array: "np.ndarray") -> str:
    nbests = speech2text(audio_array)
    text, tokens, *_ = nbests[0]
    return text


def recognize_microphone_input(recognizer: "sr.Recognizer") -> str:
    audio = input_from_microphone(recognizer)
    array = convert_to_array(audio)
    return recognize_speech(array)


def process_text(sentence: str) -> str:
    return sentence


dnntts_engine = DNNTTS()


def say(sentence: str):
    audio_array, sampling_rate = dnntts_engine.tts(sentence)
    sd.play(audio_array, sampling_rate)
    sd.wait()


if __name__ == "__main__":
    r = sr.Recognizer()
    while True:
        text = recognize_microphone_input(r)
        response = process_text(text)
        say(response)
        # Prompt: "Press q to quit, or Enter to continue"
        character = input("ここで終了する場合はq、続ける場合はEnterを押してください: ")
        if character.strip().lower() == "q":
            break
Validated the system (Shion) by piecing together quick implementations
By pip installing libraries, we can download and use pre-trained models for machine learning tasks!
shion.py
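For reference, a rough list of the packages that appeared in this talk (names as used on pip; PyAudio is needed by SpeechRecognition's Microphone, the Google Cloud example additionally needs Google's API client library, and versions are omitted on purpose):

pip install SpeechRecognition PyAudio ttslearn sounddevice soundfile espnet-model-zoo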
If part of what you want to create can be viewed as a machine learning task, the approach shown here can also be used.
I would be happy if this talk provides a little inspiration for your Maker project.