58 lines
1.6 KiB
Python
58 lines
1.6 KiB
Python
"""Phase 5 component test: MuseTalk lip-sync + ffmpeg mux.
|
|
|
|
Verifies the full library-mode per-turn path:
  - Pre-bake a library clip.
  - Generate a stand-in TTS waveform (sine tone).
  - Call ``VideoEngine.generate_speaking_clip`` and get a valid MP4 back.

Writes the resulting clip to ``tests/component/_out/phase5_speaking.mp4``.

Run:
    docker compose exec voice-chat python -m tests.component.test_05_musetalk_lipsync
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
|
|
from server.video import VideoConfig, VideoEngine
|
|
from tests.component._common import (
|
|
ensure_sample_avatar,
|
|
get_logger,
|
|
synth_tone,
|
|
write_bytes,
|
|
)
|
|
|
|
# Module-level logger for this component test, obtained from the shared
# ``tests.component._common.get_logger`` helper under the name "test_05".
log = get_logger("test_05")
|
|
|
|
|
|
def run() -> None:
    """Run the library-mode speaking-clip component check end to end.

    Steps:
      1. Build a ``VideoConfig`` for ``"library"`` mode (resolution 480,
         16 fps, one base clip of 4 seconds) and construct a ``VideoEngine``.
      2. Load the engine's models and set the avatar to the path returned by
         ``ensure_sample_avatar()``.
      3. Synthesize 3.0 seconds of tone with ``synth_tone`` and pass it to
         ``VideoEngine.generate_speaking_clip``.
      4. Validate the returned payload and persist it with ``write_bytes``
         under the name ``phase5_speaking.mp4``.

    Raises:
        AssertionError: If the engine returns something other than ``bytes``,
            returns an empty payload, or bytes 4..8 of the payload are not
            ``b"ftyp"``.
    """
    # Single source of truth: the synthesized waveform and the engine call
    # must be given the same rate, so it is defined exactly once.
    sample_rate = 24000

    avatar_path = ensure_sample_avatar()
    cfg = VideoConfig.from_dict(
        {
            "enabled": True,
            "mode": "library",
            "resolution": 480,
            "fps": 16,
            "library": {"base_clip_count": 1, "base_clip_seconds": 4},
        }
    )
    engine = VideoEngine(cfg)
    engine.load_models()
    engine.set_avatar(avatar_path)

    audio = synth_tone(seconds=3.0, sample_rate=sample_rate, freq=220.0)
    log.info("Generating library-mode speaking clip (3s audio)...")
    mp4 = engine.generate_speaking_clip(
        audio_f32=audio,
        sample_rate=sample_rate,
        reply_text="Hello, this is a lip-sync test.",
    )

    # Checks are split so a failure names the exact condition that broke.
    assert isinstance(mp4, bytes), (
        f"generate_speaking_clip returned {type(mp4).__name__}, expected bytes"
    )
    assert len(mp4) > 0, "generate_speaking_clip returned an empty payload"
    # An MP4 container normally opens with a 4-byte box size followed by the
    # box type "ftyp", so bytes 4..8 serve as a cheap container sanity check.
    assert mp4[4:8] == b"ftyp", (
        f"payload does not look like an MP4; leading bytes: {mp4[:12]!r}"
    )

    out = write_bytes("phase5_speaking.mp4", mp4)
    log.info("PASS: speaking clip written to %s (%d bytes)", out, len(mp4))
|
|
|
|
|
|
# Script entry point: execute the component check only when this module is run
# directly (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    run()
|