anki/qt/aqt/tts.py
Damien Elmes 66e277e44b more TTS and sound work
- use provided language and voices when playing on Mac
- fix hang in waiting for termination
- allow players to return a rank for a given tag,
which will allow for the best matching player to be chosen
depending on the context (eg, prioritize one player for videos,
one tts player for certain voices, etc)
2020-01-21 11:34:25 +10:00

135 lines
3.5 KiB
Python

"""
Basic text to speech support.
Users can use the following in their card template:
{{tts en_US:Field}}
or
{{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
The first argument must be a language code. If provided,
voices is a comma-separated list of one or more voices that
the user would prefer. Spaces must not be included.
Underscores will be converted to spaces.
"""
from __future__ import annotations
import re
import subprocess
from dataclasses import dataclass
from typing import List, Optional, cast
from anki.sound import AVTag, TTSTag
from aqt.sound import SimpleProcessPlayer
from aqt.taskman import TaskManager
@dataclass
class TTSArgs:
    """Parsed arguments of a ``{{tts ...}}`` template tag."""

    # requested language, taken verbatim from the first argument
    lang: str
    # preferred voices in order of preference; empty if none were requested
    voices: List[str]

    @classmethod
    def from_string(cls, args: List[str]) -> TTSArgs:
        """Build an instance from the already-split arguments of a tts tag.

        ``args[0]`` is used as the language. Every following item is expected
        to be a ``key=value`` option; items that do not contain exactly one
        ``=`` and keys other than ``voices`` are ignored. Underscores in the
        ``voices`` value are replaced with spaces before it is split on
        commas. If ``voices`` is given more than once, the last one wins.

        Raises:
            IndexError: if ``args`` is empty.
        """
        lang = args[0]
        voices: List[str] = []
        for arg in args[1:]:
            # only "key=value" items with a single "=" are recognised
            if arg.count("=") != 1:
                continue
            key, _, val = arg.partition("=")
            if key.strip() != "voices":
                continue
            names = val.strip().replace("_", " ").split(",")
            # drop the empty names a stray or trailing comma would produce
            voices = [name for name in names if name]
        # use cls so subclasses get an instance of their own type
        return cls(lang=lang, voices=voices)
# Mac support
##########################################################################
@dataclass
class MacVoice:
    """Name and language of a single speech voice."""

    # voice name; compared against the voices requested in a tts tag
    name: str
    # language of the voice; compared against the tag's language argument
    lang: str
# Matches one line of the voice listing, expected to look like
#   "Alex                en_US    # Most people recognize me by my voice."
# Group 1 is the voice name, which may itself contain spaces ("Bad News"),
# so it is matched lazily up to the last whitespace-free token before the
# "#" that introduces the sample text. Group 2 is that token (the language).
VOICE_HELP_LINE_RE = re.compile(r"^(\S.*?)\s+(\S+)\s+#.*$")


def parse_voice_line(line: str) -> Optional[MacVoice]:
    """Parse one line of voice listing output into a MacVoice.

    Returns None when the line does not have the
    ``<name> <lang> # <sample text>`` shape.
    """
    m = VOICE_HELP_LINE_RE.match(line)
    if m is None:
        return None
    return MacVoice(name=m.group(1), lang=m.group(2))
class MacTTSPlayer(SimpleProcessPlayer):
    """Text to speech player that shells out to the ``say`` command."""

    def __init__(self, taskman: TaskManager):
        super().__init__(taskman)
        # filled in lazily by voices(); None means "not queried yet"
        self._available_voices: Optional[List[MacVoice]] = None

    def _play(self, tag: AVTag) -> None:
        """Speak the text of ``tag`` using the voice chosen by voice_for_tag().

        The tag is treated as a TTSTag without a runtime check. The text is
        fed to ``say`` on stdin (``-f -``) rather than on the command line.
        """
        ttag = cast(TTSTag, tag)
        voice = self.voice_for_tag(ttag)
        self._process = subprocess.Popen(
            ["say", "-v", voice.name, "-f", "-"],
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # write the input text to stdin
        self._process.stdin.write(ttag.text.encode("utf8"))
        # closing stdin signals end of input to the child process
        self._process.stdin.close()
        # inherited helper; presumably blocks until the process has finished
        self._wait_for_termination()

    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
        """Return None for tags that are not TTS tags, otherwise a rank of 0."""
        if not isinstance(tag, TTSTag):
            return None
        # todo
        return 0

    def voices(self) -> List[MacVoice]:
        """Return the voices reported by ``say -v ?``, querying only once.

        Lines that parse_voice_line() cannot parse are skipped.

        Raises:
            subprocess.CalledProcessError: if ``say`` exits with an error
                (the command is run with ``check=True``).
        """
        # compare against None so that an empty result is cached as well,
        # instead of re-running the command on every call
        if self._available_voices is None:
            cmd = subprocess.run(
                ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
            )
            parsed = (parse_voice_line(line) for line in cmd.stdout.splitlines())
            self._available_voices = [voice for voice in parsed if voice is not None]
        return self._available_voices

    def voice_for_tag(self, tag: TTSTag) -> MacVoice:
        """Pick the voice to use for ``tag``.

        Preference order: the first requested voice that is available, then
        the first available voice whose language equals the requested
        language, then the first available voice.

        Raises:
            IndexError: if no voices are available at all.
        """
        args = TTSArgs.from_string(tag.args)
        voices = self.voices()

        # any requested voices match?
        for requested_voice in args.voices:
            avail_voice = next((x for x in voices if x.name == requested_voice), None)
            if avail_voice:
                return avail_voice

        # requested language match?
        avail_voice = next((x for x in voices if x.lang == args.lang), None)
        if avail_voice:
            return avail_voice

        # fall back on first voice
        return voices[0]