"""Phase 5 component test: MuseTalk lip-sync + ffmpeg mux. Verifies the full library-mode per-turn path: - Pre-bake a library clip. - Generate a stand-in TTS waveform (sine tone). - Call ``VideoEngine.generate_speaking_clip`` and get a valid MP4 back. Writes the resulting clip to ``tests/component/_out/phase5_speaking.mp4``. Run: docker compose exec voice-chat python -m tests.component.test_05_musetalk_lipsync """ from __future__ import annotations import sys from server.video import VideoConfig, VideoEngine from tests.component._common import ( ensure_sample_avatar, get_logger, synth_tone, write_bytes, ) log = get_logger("test_05") def run(): avatar_path = ensure_sample_avatar() cfg = VideoConfig.from_dict( { "enabled": True, "mode": "library", "resolution": 480, "fps": 16, "library": {"base_clip_count": 1, "base_clip_seconds": 4}, } ) engine = VideoEngine(cfg) engine.load_models() engine.set_avatar(avatar_path) audio = synth_tone(seconds=3.0, sample_rate=24000, freq=220.0) log.info("Generating library-mode speaking clip (3s audio)...") mp4 = engine.generate_speaking_clip( audio_f32=audio, sample_rate=24000, reply_text="Hello, this is a lip-sync test.", ) assert isinstance(mp4, bytes) and len(mp4) > 0 assert mp4[4:8] == b"ftyp" out = write_bytes("phase5_speaking.mp4", mp4) log.info("PASS: speaking clip written to %s (%d bytes)", out, len(mp4)) if __name__ == "__main__": run()