import logging
from typing import Iterator

import numpy as np

log = logging.getLogger(__name__)

DEFAULT_VOICE = "af_heart"
DEFAULT_LANG = "a"  # American English


def _to_float32_array(audio) -> np.ndarray:
    """Return *audio* as a float32 numpy array.

    Accepts numpy arrays, plain sequences, and tensor-like objects that
    expose ``detach()``/``cpu()``/``numpy()``. An input that is already a
    float32 ndarray is returned as-is (no copy).
    """
    if hasattr(audio, "detach"):
        # Tensor-like chunk: move it to host memory before converting.
        audio = audio.detach().cpu().numpy()
    return np.asarray(audio, dtype=np.float32)


class TTSEngine:
    """Wraps Kokoro TTS for fast streaming text-to-speech.

    Attributes:
        pipeline: The ``KPipeline`` used for synthesis.
        voice: Name of the voice passed to the pipeline on every call.
        lang_code: Language code the current pipeline was built with.
        sample_rate: Sample rate of the yielded audio, in Hz (24000).
    """

    def __init__(self, voice: str = DEFAULT_VOICE, lang_code: str = DEFAULT_LANG):
        """Create the pipeline for *lang_code* and select *voice*.

        Both parameters default to the module-level defaults, so calling
        ``TTSEngine()`` with no arguments behaves as before.
        """
        # Function-scope import: kokoro is only required once an engine is
        # actually constructed, not when this module is imported.
        from kokoro import KPipeline

        self.pipeline = KPipeline(lang_code=lang_code)
        self.voice = voice
        self.lang_code = lang_code
        self.sample_rate = 24000

    def set_voice(self, voice: str, lang_code: str = DEFAULT_LANG):
        """Change the voice, rebuilding the pipeline only if the language changed.

        Args:
            voice: Name of the voice to use for subsequent synthesis.
            lang_code: Language code for the pipeline.

        If building the new pipeline raises, the engine keeps its previous
        pipeline, language and voice, so it never ends up with a new voice
        paired with the old language.
        """
        needs_rebuild = (
            getattr(self, "pipeline", None) is None
            or getattr(self, "lang_code", None) != lang_code
        )
        if needs_rebuild:
            from kokoro import KPipeline

            # Build first, assign afterwards: a failure here must not leave
            # the engine half-updated.
            new_pipeline = KPipeline(lang_code=lang_code)
            self.pipeline = new_pipeline
            self.lang_code = lang_code
        self.voice = voice
        log.info("Voice set to: %s (lang: %s)", voice, lang_code)

    def synthesize_stream(self, text: str) -> Iterator[np.ndarray]:
        """Yield audio chunks as they are generated.

        Each chunk is a float32 numpy array at self.sample_rate (24kHz).
        Kokoro internally splits text into sentences and yields
        per-sentence audio.

        Empty or whitespace-only *text* yields nothing. Chunks the pipeline
        reports as ``None`` or empty are skipped.
        """
        if not text or not text.strip():
            return
        for _graphemes, _phonemes, audio in self.pipeline(text, voice=self.voice):
            if audio is None:
                continue
            chunk = _to_float32_array(audio)
            if chunk.size > 0:
                yield chunk