Implement Shion (詩音) from Sing a Bit of Harmony (讓我聽見愛的歌聲) with Python¶

:Event: PyCon APAC 2022
:Presented: 2022/07/20 (pre-recorded) nikkie

你好 (Hello)❗️ PyCon APAC 2022¶

Many thanks to all the staff who worked so hard❤️

About nikkie (me)¶

  • loves Python (& Anime, Japanese cartoons)
  • Twitter @ftnext / GitHub @ftnext
  • PyCon JP: 2019〜2020 staff & 2021 chair

About nikkie (me)¶

  • Data scientist at Uzabase, Inc. (NLP, writing Python)
  • We're hiring!! (Engineers, Data scientists, Researchers)

[Uzabase logo]

See also my other talk: "Revisit Python from statements and PEG"¶

Sing a Bit of Harmony¶

https://ainouta.jp/

Sing a Bit of Harmony¶

  • An animated film released in Japan in October 2021.
  • Sci-fi x coming-of-age x musical
  • The key character is Shion, an AI humanoid robot🤖.

Shion says "I will make you happy!"¶

https://youtu.be/1UeIEUoHZ6E

Want to implement Shion!!¶

  • Shion is a program.
  • I can write programs in Python.
  • 👉 I should be able to write a program like Shion.

Implement Shion (詩音) from Sing a Bit of Harmony (讓我聽見愛的歌聲) with Python¶

  • I will share the details of my maker project, "Implement Shion with Python".
  • I hope it provides a little inspiration for your own maker project.

Caveats⚠️¶

  • The implementation shared here is a wild fancy of nikkie (a fan)
    • The movie does not seem to mention any operating system or programming language.
  • nikkie is not an audio practitioner.
    • I am self-taught, so if there is a better way, let me know!

Implement Shion with Python¶

  • Implement one feature: talking with people
  • Start small (v0.0.1)

Define Shion v0.0.1¶

  • Implement the software only
  • A program that can talk with a human
  • Like a smart speaker

Demo: Shion v0.0.1¶

  • Reads the spoken sentences back aloud:

    • Hello (こんにちは)
    • Okay? I'm giving you a command. (いい? 命令するよ?)

Organize technical requirements¶

Technologies behind Shion v0.0.1

Definition of Shion v0.0.1¶

A human provides voice input; Shion then:

  1. Transcribe speech into text
  2. Process the text to create response text
  3. Read the response text out loud

Technical requirements¶

  • Input: convert voice to text
  • Output: read text out loud
  • Processing: parroting (this time)
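
Putting the three requirements together, the target loop looks roughly like this (a minimal sketch; speech_to_text and text_to_speech are placeholder names, wired up with real implementations later in this talk):

from typing import Callable


def conversation_loop(
    speech_to_text: Callable[[], str],  # ASR piece (built later)
    text_to_speech: Callable[[str], None],  # TTS piece (built later)
) -> None:
    """Sketch of the Shion v0.0.1 loop: listen, parrot, speak."""
    while True:
        text = speech_to_text()  # 1. transcribe speech into text
        response = text  # 2. parroting (this time)
        text_to_speech(response)  # 3. read the response out loud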

Validate then refine¶

  • I won't know whether I'm satisfied with Shion until I make it.
  • Validate the idea quickly as the first move.
  • If it looks good, make it more like Shion.

Technology to read text out loud¶

  • Called "speech synthesis"
  • Also called Text-To-Speech (TTS)

TTS (Text-To-Speech) in this talk¶

  • First move: call OS command
  • Refinement: use a pre-trained machine learning model

Technology to convert voice to text¶

  • Called "speech recognition"
  • Also called Automatic Speech Recognition (ASR)

ASR (Automatic Speech Recognition) in this talk¶

  • First move: call Web API
  • Refinement: use a pre-trained machine learning model

Technology line-up in this talk¶

  • TTS first move
  • ASR first move
  • TTS refinement
  • ASR refinement

TTS first move: call OS command¶

Shion v0.0.1 at this step¶

  1. Transcribe speech into text
  2. Process the text to create response text
  3. Read the response text out loud 👈

TTS command¶

  • macOS: say command (detailed later)
  • Linux and Windows: espeak command
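
For Linux and Windows, a minimal sketch calling espeak from Python (assuming the espeak binary is installed and on PATH):

import subprocess

# Read text aloud with espeak
subprocess.run(["espeak", "Hello, PyCon APAC!"])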

say command in macOS¶

say -v <voice> <text>

In [2]:
!say -v Kyoko いま、幸せ?

say -v ?: obtain a list of voices¶

  • ja_JP: Kyoko
  • zh_TW: Mei-Jia
In [3]:
!say -v Mei-Jia 你好

Call say command from Python¶

  • subprocess in the standard library
  • Example in docs: "Speaking logging messages"
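
That docs example wraps a TTS command in a logging handler. A condensed sketch of the idea (not the verbatim cookbook code; the class name here is mine, and I use say instead of the docs' espeak):

import logging
import subprocess


class SpeakingHandler(logging.Handler):
    """Read each log message aloud (here with the macOS say command)."""

    def emit(self, record: logging.LogRecord) -> None:
        subprocess.run(["say", self.format(record)])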

subprocess.run¶

  • Calls any command, not just TTS.
  • Pass the command as a sequence of program arguments.

Example

subprocess.run(["ls", "-l"])  # Call `ls -l`

TTS with subprocess.run¶

In [4]:
import subprocess
In [5]:
subprocess.run(["say", "-v", "Kyoko", "いま、幸せ?"])
Out[5]:
CompletedProcess(args=['say', '-v', 'Kyoko', 'いま、幸せ?'], returncode=0)
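
If you want a failure to raise an exception instead of passing silently, subprocess.run accepts check=True:

# Raises CalledProcessError if `say` exits with a non-zero status
subprocess.run(["say", "-v", "Kyoko", "いま、幸せ?"], check=True)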

TTS sample script¶

# readline import enables line editing for input()
import readline  # noqa: F401
import subprocess


def say(sentence: str):
    """Read the sentence aloud with the macOS say command."""
    subprocess.run(["say", "-v", "Kyoko", sentence])


if __name__ == "__main__":
    while True:
        # Prompt: "Enter a sentence to read aloud (q to quit)"
        sentence = input("読み上げたい文を入力してください (qで終了): ")
        stripped = sentence.strip()
        if not stripped:
            continue
        if stripped.lower() == "q":
            break

        say(stripped)

Technology line-up in this talk¶

  • TTS first move
  • ASR first move
  • TTS refinement
  • ASR refinement

ASR first move: Call Web API¶

Shion v0.0.1 at this step¶

  1. Transcribe speech into text 👈
  2. Process the text to create response text
  3. Read the response text out loud ✅

ASR Web APIs¶

  • Google Cloud Speech-to-Text API (👈 used this time)
  • Microsoft Azure Speech
  • IBM Speech to Text
  • etc. etc.

SpeechRecognition¶

  • Library for ASR
  • Supports Web APIs and engines
  • https://github.com/Uberi/speech_recognition

Process with SpeechRecognition¶

  1. Get audio from a microphone
  2. Send audio to ASR Web API

1. Get audio from a microphone¶

In [6]:
import speech_recognition as sr
In [7]:
r = sr.Recognizer()
In [8]:
with sr.Microphone(sample_rate=16_000) as source:
    print("なにか話してください")  # "Please say something"
    audio = r.listen(source)
    print("音声を取得しました")  # "Audio captured"
なにか話してください
音声を取得しました

2. Send audio to ASR Web API¶

Select Google Cloud Speech-to-Text API

In [9]:
import os
In [10]:
# The service account key path comes from an environment variable
with open(os.environ["SPEECH_TO_TEXT_API_SERVICE_ACCOUNT_KEY"]) as f:
    credentials = f.read()

2. Send audio to ASR Web API (cont.)¶

In [11]:
recognized_text = r.recognize_google_cloud(
    audio, credentials, language="ja-JP"
)
print(recognized_text.strip())
こんにちは
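
One caveat worth knowing: recognize_google_cloud raises SpeechRecognition's own exceptions on failure, so a long-running loop may want to catch them. A minimal sketch:

# Sketch: handle the two failure modes of recognize_google_cloud
try:
    recognized_text = r.recognize_google_cloud(
        audio, credentials, language="ja-JP"
    )
except sr.UnknownValueError:
    print("Could not recognize the speech")
except sr.RequestError as error:
    print(f"API request failed: {error}")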

ASR sample script¶

import argparse

import speech_recognition as sr


def input_from_microphone(recognizer: "sr.Recognizer") -> "sr.AudioData":
    """Capture one utterance from the default microphone."""
    with sr.Microphone(sample_rate=16_000) as source:
        print("なにか話してください")  # "Please say something"
        audio = recognizer.listen(source)
        print("音声を取得しました")  # "Audio captured"
        return audio


def recognize_speech(
    recognizer: "sr.Recognizer", audio: "sr.AudioData", credentials: str
) -> str:
    """Transcribe the audio with the Google Cloud Speech-to-Text API."""
    recognized_text = recognizer.recognize_google_cloud(
        audio, credentials, language="ja-JP"
    )
    return recognized_text.strip()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("credentials_path")
    args = parser.parse_args()

    with open(args.credentials_path) as f:
        credentials = f.read()

    r = sr.Recognizer()

    while True:
        audio = input_from_microphone(r)
        text = recognize_speech(r, audio, credentials)
        print(text)

        # Prompt: "Press q to quit here, or Enter to continue"
        character = input("ここで終了する場合はq、続ける場合はEnterを押してください: ")
        if character.strip().lower() == "q":
            break

Process text¶

Shion v0.0.1 at this step¶

  1. Transcribe speech into text ✅
  2. Process the text to create response text 👈
  3. Read the response text out loud ✅

Parroting the text🦜 (this time)¶

  • The simplest text processing
def talk_with_chatbot(sentence: str) -> str:
    return sentence
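
For example, the parrot returns its input unchanged:

>>> talk_with_chatbot("いま、幸せ?")
'いま、幸せ?'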

Place validating ideas quickly above everything¶

  • TTS: subprocess.run
  • ASR: Web API
  • parroting the text

Validation result¶

  • LGTM👍 (my qualitative feedback)
  • Make it more like Shion!

Room for refinement of quick implementation 1/2¶

  • The say command depends on the OS (macOS)
  • Shion does not seem to run macOS

Room for refinement of quick implementation 2/2¶

  • Calling a Web API requires Internet access
  • Shion is standalone, i.e., does not communicate with Web APIs

Make it more like Shion!¶

  • Download pre-trained machine learning models beforehand
  • TTS & ASR with machine learning models

Technology line-up in this talk¶

  • TTS first move
  • ASR first move
  • TTS refinement
  • ASR refinement

TTS refinement: use pre-trained model¶

Shion v0.0.1 at this step¶

  1. Transcribe speech into text ✅
  2. Process the text to create response text ✅
  3. Read the response text out loud 👈

ttslearn¶

  • Library for TTS (with Japanese support)
  • Companion library to the book 『Pythonで学ぶ音声合成』 (Speech Synthesis with Python)📘
  • https://github.com/r9y9/ttslearn

Example of speech synthesis in Japanese¶

In [12]:
from ttslearn.dnntts import DNNTTS
In [13]:
dnntts_engine = DNNTTS()
In [14]:
audio_array, sampling_rate = dnntts_engine.tts("いま、幸せ?")

DNNTTS()¶

  • Implementation of TTS with a deep neural network (DNN)
  • Loads pre-trained models (downloading them if needed)
  • The tts method returns a NumPy array of audio data (with its sampling rate)

sounddevice¶

Play and Record Sound with Python

  • https://github.com/spatialaudio/python-sounddevice/
  • Used here to play NumPy arrays of audio data

Example of TTS¶

In [15]:
import sounddevice as sd
In [16]:
sd.play(audio_array, sampling_rate)
sd.wait()
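
As an aside, if you also want to keep the synthesized audio, soundfile (which appears later in this talk for ASR) can write the array to a WAV file. A minimal sketch (the filename is arbitrary):

import soundfile as sf

# Persist the synthesized audio as a WAV file
sf.write("synthesized.wav", audio_array, sampling_rate)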

Refined TTS sample script¶

# readline import enables line editing for input()
import readline  # noqa: F401

import sounddevice as sd
from ttslearn.dnntts import DNNTTS

# Load the pre-trained DNN-based TTS model (downloaded on first use)
dnntts_engine = DNNTTS()


def say(sentence: str):
    """Synthesize the sentence and play it through the speakers."""
    audio_array, sampling_rate = dnntts_engine.tts(sentence)
    sd.play(audio_array, sampling_rate)
    sd.wait()  # block until playback finishes


if __name__ == "__main__":
    while True:
        # Prompt: "Enter a sentence to read aloud (q to quit)"
        sentence = input("読み上げたい文を入力してください (qで終了): ")
        stripped = sentence.strip()
        if not stripped:
            continue
        if stripped.lower() == "q":
            break

        say(stripped)

Technology line-up in this talk¶

  • TTS first move
  • ASR first move
  • TTS refinement
  • ASR refinement

ASR refinement: use pre-trained model¶

Shion v0.0.1 at this step¶

  1. Transcribe speech into text 👈
  2. Process the text to create response text ✅
  3. Read the response text out loud ✅✨

ESPnet¶

end-to-end speech processing toolkit

  • https://github.com/espnet/espnet
  • Used here for ASR (the library also supports TTS; future work)

Use pre-trained model in ESPnet¶

  • Use the model published on Hugging Face

    • pre-trained by its owner
  • pip install espnet-model-zoo

Example of using a pre-trained model¶

In [17]:
from espnet2.bin.asr_inference import Speech2Text
In [18]:
speech2text = Speech2Text.from_pretrained(
    "kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"
)

Refine the ASR feature with a pre-trained model in ESPnet¶

  1. First step: ASR of WAV file
  2. ASR of voice input from microphone

First step: ASR of WAV file¶

SoundFile¶

an audio library based on libsndfile, CFFI and NumPy.

  • https://github.com/bastibe/python-soundfile

Tip: create a WAV file with the say command¶

In [19]:
!say -v Kyoko いま、幸せ? -o sample.wav --data-format=LEF32@16000
  • @16000 specifies the sampling rate (ref: man say)
  • This model is pre-trained at a sampling rate of 16,000 Hz, so we match it.
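
To double-check a file's rate before feeding it to the model, soundfile (imported in the next cell) can report it. A quick sketch:

import soundfile as sf

# The WAV file's sampling rate should match the model's 16,000 Hz
assert sf.info("sample.wav").samplerate == 16_000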

ASR of WAV file¶

In [20]:
import soundfile as sf
In [21]:
speech_array, sampling_rate = sf.read("sample.wav")
In [22]:
nbests = speech2text(speech_array)
text, tokens, *_ = nbests[0]
print(text)
今幸せ

ASR of voice input from microphone¶

Handle microphone: SpeechRecognition again¶

In [23]:
r = sr.Recognizer()
with sr.Microphone(sample_rate=16_000) as source:
    print("なにか話してください")  # "Please say something"
    audio = r.listen(source)
    print("音声を取得しました")  # "Audio captured"
なにか話してください
音声を取得しました

Get bytes in WAV format¶

In [24]:
wav_bytes = audio.get_wav_data()
type(wav_bytes)
Out[24]:
bytes

Convert to NumPy array¶

In [25]:
from io import BytesIO
In [26]:
wav_stream = BytesIO(wav_bytes)
speech_array, sampling_rate = sf.read(wav_stream)
type(speech_array)
Out[26]:
numpy.ndarray

ASR of array¶

In [27]:
nbests = speech2text(speech_array)
text, tokens, *_ = nbests[0]
print(text)
えー今幸せ

Refined ASR sample script¶

from io import BytesIO

import numpy as np
import soundfile as sf
import speech_recognition as sr
from espnet2.bin.asr_inference import Speech2Text

# Load the pre-trained Japanese ASR model published on Hugging Face
speech2text = Speech2Text.from_pretrained(
    "kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"
)

SAMPLING_RATE_HZ = 16_000  # the rate the ASR model was pre-trained at


def input_from_microphone(recognizer: "sr.Recognizer") -> "sr.AudioData":
    """Capture one utterance from the default microphone."""
    with sr.Microphone(sample_rate=SAMPLING_RATE_HZ) as source:
        print("なにか話してください")  # "Please say something"
        audio = recognizer.listen(source)
        print("音声を取得しました")  # "Audio captured"
        return audio


def convert_to_array(audio: "sr.AudioData") -> "np.ndarray":
    """Convert the captured audio to a NumPy array for the ASR model."""
    wav_bytes = audio.get_wav_data()
    wav_stream = BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    assert sampling_rate == SAMPLING_RATE_HZ
    return audio_array


def recognize_speech(audio_array: "np.ndarray") -> str:
    """Transcribe the audio array; return the best hypothesis."""
    nbests = speech2text(audio_array)
    text, tokens, *_ = nbests[0]
    return text


if __name__ == "__main__":
    r = sr.Recognizer()

    while True:
        audio = input_from_microphone(r)
        array = convert_to_array(audio)
        text = recognize_speech(array)
        print(text)

        # Prompt: "Press q to quit here, or Enter to continue"
        character = input("ここで終了する場合はq、続ける場合はEnterを押してください: ")
        if character.strip().lower() == "q":
            break

Shion v0.0.1 refined!¶

  1. Transcribe speech into text ✅✨
  2. Process the text to create response text ✅
  3. Read the response text out loud ✅✨

shion.py: the integration¶

  1. Transcribe speech into text (ASR)
  2. Process the text to create response text (parroting)
  3. Read the response text out loud (TTS)

shion.py¶

from io import BytesIO

import numpy as np
import sounddevice as sd
import soundfile as sf
import speech_recognition as sr
from espnet2.bin.asr_inference import Speech2Text
from ttslearn.dnntts import DNNTTS

# Load the pre-trained Japanese ASR model published on Hugging Face
speech2text = Speech2Text.from_pretrained(
    "kan-bayashi/csj_asr_train_asr_transformer_raw_char_sp_valid.acc.ave"
)

SAMPLING_RATE_HZ = 16_000  # the rate the ASR model was pre-trained at


def input_from_microphone(recognizer: "sr.Recognizer") -> "sr.AudioData":
    """Capture one utterance from the default microphone."""
    with sr.Microphone(sample_rate=SAMPLING_RATE_HZ) as source:
        print("なにか話してください")  # "Please say something"
        audio = recognizer.listen(source)
        print("音声を取得しました")  # "Audio captured"
        return audio


def convert_to_array(audio: "sr.AudioData") -> "np.ndarray":
    """Convert the captured audio to a NumPy array for the ASR model."""
    wav_bytes = audio.get_wav_data()
    wav_stream = BytesIO(wav_bytes)
    audio_array, sampling_rate = sf.read(wav_stream)
    assert sampling_rate == SAMPLING_RATE_HZ
    return audio_array


def recognize_speech(audio_array: "np.ndarray") -> str:
    """Transcribe the audio array; return the best hypothesis."""
    nbests = speech2text(audio_array)
    text, tokens, *_ = nbests[0]
    return text


def recognize_microphone_input(recognizer: "sr.Recognizer") -> str:
    """Step 1: transcribe speech into text (ASR)."""
    audio = input_from_microphone(recognizer)
    array = convert_to_array(audio)
    return recognize_speech(array)


def process_text(sentence: str) -> str:
    """Step 2: create the response text (parroting, this time)."""
    return sentence


# Load the pre-trained DNN-based TTS model (downloaded on first use)
dnntts_engine = DNNTTS()


def say(sentence: str):
    """Step 3: read the response text out loud (TTS)."""
    audio_array, sampling_rate = dnntts_engine.tts(sentence)
    sd.play(audio_array, sampling_rate)
    sd.wait()  # block until playback finishes


if __name__ == "__main__":
    r = sr.Recognizer()

    while True:
        text = recognize_microphone_input(r)
        response = process_text(text)
        say(response)

        # Prompt: "Press q to quit here, or Enter to continue"
        character = input("ここで終了する場合はq、続ける場合はEnterを押してください: ")
        if character.strip().lower() == "q":
            break

What I'm learning by implementing Shion with Python¶

  • Implement quickly
  • Use machine learning (2 points)

Implement quickly¶

  • Put implementing TTS and ASR quickly above everything
  • Validated the system (Shion) by piecing together quick implementations

    • ref: tracer bullets in "The Pragmatic Programmer"

Use machine learning 1/2¶

  • Used an ASR Web API as the first move
  • We developers always have the option of using Web APIs for machine learning tasks!

Use machine learning 2/2¶

  • Used pre-trained models for TTS and ASR
  • Just as we pip install libraries, we can download and use pre-trained models for machine learning tasks!

Summary🌯 Implement Shion (詩音) from Sing a Bit of Harmony (讓我聽見愛的歌聲) with Python¶

  • Define Shion v0.0.1, implement as shion.py
  • Share implementations (ASR and TTS in Python) and lessons

Define Shion v0.0.1¶

  1. Transcribe speech into text (ASR)
  2. Process the text to create response text (parroting)
  3. Read the response text out loud (TTS)

I won't know whether I'm satisfied with Shion until I make it, but¶

  • Quick implementations as the first move; validate the idea first
  • Pieced together quick implementations to validate the system (tracer bullets)
  • It looked good to me, so I'm making it more like Shion

ASR in Python¶

  • Use a Web API (as the quick implementation)
  • Use pre-trained machine learning model

TTS in Python¶

  • Call an OS command (as the quick implementation)
  • Use pre-trained machine learning model

Lessons through Shion v0.0.1¶

  • If part of what you want to create can be viewed as a machine learning task, the following approaches are also available

    • Use Web API
    • Use pre-trained models

Thank you very much for your attention.¶

I hope this talk provides a little inspiration for your maker project.