# live-voice-chat/tests/component/test_05_musetalk_lipsync.py

"""Phase 5 component test: MuseTalk lip-sync + ffmpeg mux.

Verifies the full library-mode per-turn path:
- Pre-bake a library clip.
- Generate a stand-in TTS waveform (sine tone).
- Call ``VideoEngine.generate_speaking_clip`` and get a valid MP4 back.

Writes the resulting clip to ``tests/component/_out/phase5_speaking.mp4``.

Run:
    docker compose exec voice-chat python -m tests.component.test_05_musetalk_lipsync
"""
from __future__ import annotations

import sys

from server.video import VideoConfig, VideoEngine
from tests.component._common import (
    ensure_sample_avatar,
    get_logger,
    synth_tone,
    write_bytes,
)

# Module-level logger for this component test, obtained from the shared
# ``tests.component._common.get_logger`` helper under the name "test_05".
log = get_logger("test_05")


def run():
    """Run the Phase 5 lip-sync check end to end and save the resulting clip.

    Steps, in order:

    1. Obtain the sample avatar path and build a library-mode ``VideoConfig``
       (480 resolution, 16 fps, one 4-second base clip).
    2. Create the ``VideoEngine``, load its models and register the avatar.
    3. Synthesize a 3-second, 220 Hz tone at 24 kHz as stand-in TTS audio.
    4. Request a speaking clip and check that the result is a non-empty
       ``bytes`` object carrying the ``ftyp`` tag at byte offsets 4-8.
    5. Persist the clip via ``write_bytes`` and log the returned location.

    Raises:
        AssertionError: If the returned clip is not non-empty ``bytes`` or
            lacks the ``ftyp`` tag.
    """
    # One rate shared by the tone generator and the engine call so the two
    # can never disagree.
    tone_rate = 24000

    sample_avatar = ensure_sample_avatar()

    settings = {
        "enabled": True,
        "mode": "library",
        "resolution": 480,
        "fps": 16,
        "library": {"base_clip_count": 1, "base_clip_seconds": 4},
    }
    video_engine = VideoEngine(VideoConfig.from_dict(settings))
    video_engine.load_models()
    video_engine.set_avatar(sample_avatar)

    # A plain sine tone stands in for real TTS output.
    tone = synth_tone(seconds=3.0, sample_rate=tone_rate, freq=220.0)

    log.info("Generating library-mode speaking clip (3s audio)...")
    clip = video_engine.generate_speaking_clip(
        audio_f32=tone,
        sample_rate=tone_rate,
        reply_text="Hello, this is a lip-sync test.",
    )

    # Type check first so a non-bytes result fails the assertion rather than
    # the length computation.
    assert isinstance(clip, bytes)
    clip_size = len(clip)
    assert clip_size > 0
    # The container tag expected near the start of an MP4 payload.
    assert clip[4:8] == b"ftyp"

    saved_to = write_bytes("phase5_speaking.mp4", clip)
    log.info("PASS: speaking clip written to %s (%d bytes)", saved_to, clip_size)

# Allow running this component test directly as a script/module
# (see the "Run:" command in the module docstring).
if __name__ == "__main__":
    run()