# live-voice-chat/server/tts.py

import logging
from typing import Iterator

import numpy as np

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Voice name a new TTSEngine starts with.
DEFAULT_VOICE = "af_heart"
# Language code a new TTSEngine passes to KPipeline.
DEFAULT_LANG = "a"  # American English


class TTSEngine:
    """Wraps Kokoro TTS for fast streaming text-to-speech.

    The engine owns a single ``KPipeline`` and the name of the voice that is
    passed to it on every synthesis call.
    """

    def __init__(self):
        """Build the pipeline for the default language and select the default voice."""
        # Function-scope import: Kokoro is only loaded once an engine is created.
        from kokoro import KPipeline

        # Language code the current pipeline was built with; lets set_voice()
        # tell whether the pipeline has to be rebuilt.
        self.lang_code = DEFAULT_LANG
        self.pipeline = KPipeline(lang_code=self.lang_code)
        # Voice name handed to the pipeline on each synthesis call.
        self.voice = DEFAULT_VOICE
        # Sample rate, in Hz, that this engine reports for its audio output.
        self.sample_rate = 24000

    def set_voice(self, voice: str, lang_code: str = "a"):
        """Change the voice and, if needed, the pipeline language.

        Args:
            voice: Voice name to pass to the pipeline on later synthesis calls.
            lang_code: Language code for the pipeline. A new ``KPipeline`` is
                built only when this differs from the current language.

        Raises:
            Whatever ``KPipeline`` raises when it cannot be constructed for
            ``lang_code``; in that case the engine keeps its previous voice,
            language and pipeline.
        """
        if lang_code != self.lang_code:
            from kokoro import KPipeline

            # Build first, commit afterwards: if construction fails, the
            # engine must not end up with a new voice on the old pipeline.
            new_pipeline = KPipeline(lang_code=lang_code)
            self.pipeline = new_pipeline
            self.lang_code = lang_code

        self.voice = voice
        log.info("Voice set to: %s (lang: %s)", voice, lang_code)

    def synthesize_stream(self, text: str) -> Iterator[np.ndarray]:
        """Yield audio chunks as they are generated.

        The pipeline is called with ``text`` and the current voice, and the
        third element of every result it produces is yielded, skipping results
        whose audio is ``None`` or has length zero. Empty or whitespace-only
        ``text`` yields nothing and does not invoke the pipeline.

        NOTE(review): chunks are documented as float32 numpy arrays at
        ``self.sample_rate`` (24 kHz), with Kokoro splitting the text into
        sentences and yielding per-sentence audio. Neither is enforced here;
        confirm against the installed Kokoro version.
        """
        if not text or not text.strip():
            # Nothing to speak.
            return

        for _graphemes, _phonemes, audio in self.pipeline(text, voice=self.voice):
            if audio is not None and len(audio) > 0:
                yield audio