first stab at adding video

This commit is contained in:
2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
View File
+72
View File
@@ -0,0 +1,72 @@
"""Shared utilities for component tests.
Component tests run inside the Docker image against real GPU models. They
write their output artefacts (MP4s, PNGs, logs) to ``_out/`` so you can
visually inspect results.
"""
from __future__ import annotations
import logging
import os
import sys
import numpy as np
# Directory that holds this module; both paths below are resolved against it.
_HERE = os.path.dirname(__file__)

# Artefacts produced by the component tests are collected under ``_out/``
# beside this module. The directory is created at import time so callers can
# write into it without checking first.
OUT_DIR = os.path.join(_HERE, "_out")
os.makedirs(OUT_DIR, exist_ok=True)

# A tiny 256x256 portrait PNG is expected next to the component tests so the
# tests don't need a user-supplied file. If it's missing, a replacement is
# synthesised on the fly.
SAMPLE_AVATAR = os.path.join(_HERE, "sample_avatar.png")
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name*, after configuring root logging.

    ``logging.basicConfig`` is invoked on every call with INFO level, a
    timestamped format and ``sys.stdout`` as the stream.
    """
    line_format = "%(asctime)s %(name)s %(levelname)s %(message)s"
    logging.basicConfig(stream=sys.stdout, format=line_format, level=logging.INFO)
    logger = logging.getLogger(name)
    return logger
def ensure_sample_avatar() -> str:
    """Guarantee a usable avatar image exists and return its path.

    If ``SAMPLE_AVATAR`` already exists on disk it is returned untouched.
    Otherwise a 256x256 RGB gradient is synthesised and written there. The
    gradient won't look like a person; it is only meant to be a valid image
    so the pipeline has something to consume.

    Pillow is preferred for writing the PNG; when it is not importable the
    image is written with ``imageio.v3`` instead.

    Returns:
        The path stored in ``SAMPLE_AVATAR``.
    """
    if os.path.isfile(SAMPLE_AVATAR):
        return SAMPLE_AVATAR

    # Build the gradient once, independent of which writer is available:
    # red rises with the row index, green falls, blue is constant.
    rows = np.arange(256, dtype=np.uint8)[:, np.newaxis]
    arr = np.empty((256, 256, 3), dtype=np.uint8)
    arr[..., 0] = rows
    arr[..., 1] = 255 - rows
    arr[..., 2] = 128

    # Only the import is guarded, so a failure while saving is not mistaken
    # for a missing dependency.
    try:
        from PIL import Image  # type: ignore[import-not-found]
    except ImportError:
        import imageio.v3 as iio  # type: ignore[import-not-found]

        iio.imwrite(SAMPLE_AVATAR, arr)
    else:
        Image.fromarray(arr).save(SAMPLE_AVATAR)
    return SAMPLE_AVATAR
def write_bytes(name: str, data: bytes) -> str:
    """Store *data* as the file *name* inside ``OUT_DIR``.

    The file is opened in binary write mode, so an existing file with the
    same name is overwritten.

    Returns:
        The full path of the written artefact.
    """
    destination = os.path.join(OUT_DIR, name)
    with open(destination, "wb") as sink:
        sink.write(data)
    return destination
def synth_tone(seconds: float, sample_rate: int = 24000, freq: float = 220.0) -> np.ndarray:
    """Return a float32 sine tone usable as stand-in TTS audio.

    The result holds ``int(seconds * sample_rate)`` samples of a sine wave at
    *freq* Hz with a peak amplitude of 0.2.
    """
    n_samples = int(seconds * sample_rate)
    timeline = np.arange(n_samples, dtype=np.float32) / sample_rate
    omega = 2 * np.pi * freq
    wave = 0.2 * np.sin(omega * timeline)
    return wave.astype(np.float32)
+46
View File
@@ -0,0 +1,46 @@
"""Run every component test in order. Stops at first failure.
docker compose exec voice-chat python -m tests.component.run_all
"""
import importlib
import sys
import traceback
# Component test modules, listed in the order they are executed.
SCRIPTS = [
    "tests.component.test_01_video_skeleton",
    "tests.component.test_02_wan22_loras",
    "tests.component.test_03_idle_clip",
    "tests.component.test_04_library_prebake",
    "tests.component.test_05_musetalk_lipsync",
    "tests.component.test_06_reflective",
    "tests.component.test_07_endpoints",
    "tests.component.test_08_lora_reload",
]


def main() -> int:
    """Import and run every module in ``SCRIPTS``, stopping at the first failure.

    For each module its ``run()`` function is called. A ``SystemExit`` with a
    falsy code is treated as a non-failure and the loop continues; a truthy
    exit code or any other exception records the module as failed and ends
    the loop.

    Returns:
        0 when no module failed, 1 otherwise.
    """
    failed: list[str] = []
    for name in SCRIPTS:
        print(f"\n{'=' * 70}\nRUNNING: {name}\n{'=' * 70}")
        try:
            importlib.import_module(name).run()
        except SystemExit as exit_request:
            if not exit_request.code:
                # Exit code 0/None: the script asked to stop without failing.
                continue
            print(f"FAILED: {name} (exit {exit_request.code})")
            failed.append(name)
            break  # hard-stop on failure
        except Exception:
            traceback.print_exc()
            failed.append(name)
            break

    if not failed:
        print("\nALL COMPONENT TESTS PASSED")
        return 0
    print(f"\n{len(failed)} failed: {failed}")
    return 1


if __name__ == "__main__":
    sys.exit(main())
+69
View File
@@ -0,0 +1,69 @@
"""Phase 1 component test: VideoEngine skeleton + config gate.
Verifies:
- ``ModelManager`` can be imported and constructed.
- When ``config.video.enabled=false``, ``_load_video`` skips and leaves
``video_engine=None`` (existing voice path unaffected).
- When ``config.video.enabled=true``, a ``VideoEngine`` instance is created
and ``is_ready()`` returns False (no models loaded yet).
Does NOT load Wan2.2 or MuseTalk — this test is safe to run on any machine
with the python deps installed (no GPU needed).
Run inside Docker:
docker compose exec voice-chat python -m tests.component.test_01_video_skeleton
"""
from __future__ import annotations
import sys
from server.models import ModelManager
from server.video import VideoConfig, VideoEngine
from tests.component._common import get_logger
log = get_logger("test_01")


def run():
    """Check the video config gate on ``ModelManager._load_video``.

    Case 1 swaps ``server.config.config`` for a copy whose ``video`` section
    is disabled and asserts that no engine is created. Case 2 swaps in an
    enabled ``video`` section and asserts that a ``VideoEngine`` exists but
    reports not ready. The original config object is restored after each
    case, even when an assertion fails.
    """
    # --- disabled path ---
    log.info("[case 1] config.video.enabled=False → engine skipped")
    disabled_mgr = ModelManager()

    # Monkey-patch the config module to simulate disabled
    import server.config as cfgmod

    saved_config = cfgmod.config
    without_video = {k: v for k, v in saved_config.items() if k != "video"}
    cfgmod.config = {"video": {"enabled": False}, **without_video}
    try:
        disabled_mgr._load_video()
        assert disabled_mgr.video_engine is None, "video_engine should be None when disabled"
        log.info(" PASS: video_engine is None")
    finally:
        cfgmod.config = saved_config

    # --- enabled path (no models loaded) ---
    log.info("[case 2] config.video.enabled=True → engine created, not ready")
    enabled_mgr = ModelManager()
    enabled_video = {"enabled": True, "mode": "reflective", "loras": []}
    cfgmod.config = {**saved_config, "video": enabled_video}
    try:
        enabled_mgr._load_video()
        engine = enabled_mgr.video_engine
        assert engine is not None, "video_engine should be created"
        assert isinstance(engine, VideoEngine)
        assert engine.is_ready() is False
        log.info(" PASS: engine=%s, ready=%s",
                 type(engine).__name__, engine.is_ready())
    finally:
        cfgmod.config = saved_config

    log.info("ALL PASSED")


if __name__ == "__main__":
    try:
        run()
    except AssertionError as failure:
        log.error("FAILED: %s", failure)
        sys.exit(1)
    else:
        sys.exit(0)
+106
View File
@@ -0,0 +1,106 @@
"""Phase 2 component test: Wan2.2-Lightning fp8 pipeline + LoRA stacking.
Verifies:
- ``Wan22Pipeline`` loads successfully against the fp8 distill path
(exercises the real LightX2V set_config → init_runner flow).
- ``load_loras`` / ``unload_loras`` survive with the two user LoRAs at
``/cache/loras/wan22-[HL]-e8.safetensors``.
Requires GPU and a first-run download of both HF repos (base support files
~12 GB, fp8 DIT ~30 GB). If LightX2V isn't installed the test is skipped.
Run:
docker compose exec voice-chat python -m tests.component.test_02_wan22_loras
"""
from __future__ import annotations
import os
import sys
from tests.component._common import get_logger
log = get_logger("test_02")

# Paths inside the container that this test reads.
CONFIG_JSON = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
LORA_HIGH = "/cache/loras/wan22-H-e8.safetensors"
LORA_LOW = "/cache/loras/wan22-L-e8.safetensors"


def run():
    """Construct ``Wan22Pipeline`` and exercise LoRA load/unload.

    Process exit codes raised via ``sys.exit``:
        0 - ``Wan22Pipeline`` could not be imported (treated as a skip)
        2 - pipeline construction raised
        3 - ``load_loras`` with the two LoRA specs raised
        4 - ``unload_loras`` raised

    When the LoRA files are not both present the LoRA cases are skipped and
    the function returns normally. Failures are logged with
    ``log.exception`` so the traceback of the underlying error is kept
    alongside the message.
    """
    try:
        from server.video_models.wan22 import Wan22Pipeline
    except ImportError as e:
        log.error("Wan22Pipeline import failed: %s", e)
        log.warning("SKIP: phase 2 deps not installed")
        sys.exit(0)

    from server.video import LoRASpec

    log.info("[case 1] Instantiate Wan22Pipeline "
             "(first run downloads ~42 GB total)...")
    try:
        pipe = Wan22Pipeline(
            base_repo="Wan-AI/Wan2.2-I2V-A14B",
            fp8_repo="lightx2v/Wan2.2-Distill-Models",
            config_json=CONFIG_JSON,
            model_cls="wan2.2_moe_distill",
            resolution=480,
            fps=16,
        )
    except Exception as e:
        # log.exception keeps the traceback; the message text is unchanged.
        log.exception("FAIL: Wan22Pipeline construction raised: %s", e)
        log.error("Check: LightX2V install, HF cache at /cache/huggingface, "
                  "VRAM headroom, and that %s exists inside the container.",
                  CONFIG_JSON)
        sys.exit(2)
    log.info(" PASS: pipeline constructed")

    # --- LoRAs ---
    log.info("[case 2] load_loras with empty list → no-op")
    pipe.load_loras([])
    log.info(" PASS")

    if not (os.path.isfile(LORA_HIGH) and os.path.isfile(LORA_LOW)):
        log.warning("SKIP: expected LoRA files not found at %s / %s",
                    LORA_HIGH, LORA_LOW)
        log.info("ALL PASSED (partial — LoRA cases skipped)")
        return

    log.info("[case 3] load_loras with the two MoE distill LoRAs")
    specs = [
        LoRASpec(
            path=LORA_HIGH,
            weight=1.0,
            target="high_noise",
            name="wan22-H-e8",
        ),
        LoRASpec(
            path=LORA_LOW,
            weight=1.0,
            target="low_noise",
            name="wan22-L-e8",
        ),
    ]
    try:
        pipe.load_loras(specs)
    except Exception as e:
        log.exception("FAIL: load_loras raised: %s", e)
        log.error("Check: switch_lora support for wan2.2_moe_distill in the "
                  "installed LightX2V build. If it errors there, pre-declare "
                  "LoRAs in the config_json 'lora_configs' field instead.")
        sys.exit(3)
    log.info(" PASS: LoRAs applied")

    log.info("[case 4] unload_loras")
    try:
        pipe.unload_loras()
    except Exception as e:
        log.exception("FAIL: unload_loras raised: %s", e)
        sys.exit(4)
    log.info(" PASS")

    log.info("ALL PASSED")


if __name__ == "__main__":
    run()
+66
View File
@@ -0,0 +1,66 @@
"""Phase 3 component test: avatar upload → idle clip generation.
Verifies:
- ``VideoEngine.load_models()`` + ``set_avatar(image)`` produces a non-empty
idle MP4 blob.
- The blob decodes as a valid MP4 (ftyp header).
Writes the idle clip to ``tests/component/_out/phase3_idle.mp4`` so you can
inspect it visually.
Run:
docker compose exec voice-chat python -m tests.component.test_03_idle_clip
"""
from __future__ import annotations
import sys
from server.video import VideoConfig, VideoEngine
from tests.component._common import ensure_sample_avatar, get_logger, write_bytes
log = get_logger("test_03")


def run():
    """Generate an idle clip for the sample avatar and validate it.

    Builds a ``VideoEngine`` in reflective mode, loads its models, sets the
    avatar and then checks that the idle clip is non-empty and carries the
    ``ftyp`` marker at bytes 4-8. The clip is written to
    ``phase3_idle.mp4`` via ``write_bytes``.

    Process exit codes raised via ``sys.exit``:
        2 - ``load_models`` raised
        3 - ``set_avatar`` raised

    Failures are logged with ``log.exception`` so the traceback of the
    underlying error is kept alongside the message.
    """
    avatar_path = ensure_sample_avatar()
    log.info("Using avatar: %s", avatar_path)

    cfg = VideoConfig.from_dict(
        {
            "enabled": True,
            "mode": "reflective",  # reflective skips the library prebake
            "resolution": 480,
            "fps": 16,
            "library": {"base_clip_count": 0, "base_clip_seconds": 3},
        }
    )
    engine = VideoEngine(cfg)

    log.info("Loading models (Wan2.2 + MuseTalk)...")
    try:
        engine.load_models()
    except Exception as e:
        # log.exception keeps the traceback; the message text is unchanged.
        log.exception("FAIL: load_models raised: %s", e)
        sys.exit(2)
    log.info("Models loaded.")

    log.info("Generating idle clip for avatar...")
    try:
        engine.set_avatar(avatar_path)
    except Exception as e:
        log.exception("FAIL: set_avatar raised: %s", e)
        sys.exit(3)

    idle = engine.get_idle_clip()
    assert idle is not None and len(idle) > 0, "idle clip is empty"
    # An MP4 file carries the ``ftyp`` box type at byte offset 4.
    assert idle[4:8] == b"ftyp", "idle clip is not a valid MP4"

    out_path = write_bytes("phase3_idle.mp4", idle)
    log.info("PASS: idle clip written to %s (%d bytes)", out_path, len(idle))

    assert engine.is_ready() is True
    log.info(" engine.is_ready() = True (avatar + models present)")


if __name__ == "__main__":
    run()
@@ -0,0 +1,55 @@
"""Phase 4 component test: library mode pre-bake of speaking-base clips.
Verifies:
- ``set_avatar`` under ``mode=library`` populates ``speaking_base_frames``
with ``library_base_clip_count`` entries.
- Each cached entry has shape ``[T, H, W, 3]`` uint8.
Run:
docker compose exec voice-chat python -m tests.component.test_04_library_prebake
"""
from __future__ import annotations
import sys
import numpy as np
from server.video import VideoConfig, VideoEngine
from tests.component._common import ensure_sample_avatar, get_logger
log = get_logger("test_04")


def run():
    """Pre-bake two library clips and validate the cached frame arrays.

    Builds a ``VideoEngine`` in library mode with ``base_clip_count`` of 2,
    loads models, sets the sample avatar, then asserts that
    ``speaking_base_frames`` holds two 4-D uint8 arrays whose last axis has
    length 3, and that an idle clip is available.
    """
    avatar_path = ensure_sample_avatar()

    library_settings = {"base_clip_count": 2, "base_clip_seconds": 3}
    cfg = VideoConfig.from_dict(
        {
            "enabled": True,
            "mode": "library",
            "resolution": 480,
            "fps": 16,
            "library": library_settings,
        }
    )
    engine = VideoEngine(cfg)

    log.info("Loading models...")
    engine.load_models()

    log.info("Pre-baking 2 library clips...")
    engine.set_avatar(avatar_path)

    clip_total = len(engine.speaking_base_frames)
    assert clip_total == 2, f"expected 2 base clips, got {clip_total}"

    for index, clip in enumerate(engine.speaking_base_frames):
        assert isinstance(clip, np.ndarray)
        assert clip.ndim == 4 and clip.shape[-1] == 3
        assert clip.dtype == np.uint8
        log.info(" clip %d: shape=%s", index, clip.shape)

    assert engine.get_idle_clip() is not None
    log.info("PASS: library pre-bake complete")


if __name__ == "__main__":
    run()
@@ -0,0 +1,57 @@
"""Phase 5 component test: MuseTalk lip-sync + ffmpeg mux.
Verifies the full library-mode per-turn path:
- Pre-bake a library clip.
- Generate a stand-in TTS waveform (sine tone).
- Call ``VideoEngine.generate_speaking_clip`` and get a valid MP4 back.
Writes the resulting clip to ``tests/component/_out/phase5_speaking.mp4``.
Run:
docker compose exec voice-chat python -m tests.component.test_05_musetalk_lipsync
"""
from __future__ import annotations
import sys
from server.video import VideoConfig, VideoEngine
from tests.component._common import (
ensure_sample_avatar,
get_logger,
synth_tone,
write_bytes,
)
log = get_logger("test_05")


def run():
    """Produce one library-mode speaking clip from a synthetic tone.

    Builds a ``VideoEngine`` in library mode with a single base clip, loads
    models, sets the sample avatar, then calls ``generate_speaking_clip``
    with a 3-second sine tone. The result must be non-empty ``bytes`` with
    the ``ftyp`` marker at bytes 4-8; it is written to
    ``phase5_speaking.mp4``.
    """
    avatar_path = ensure_sample_avatar()

    settings = {
        "enabled": True,
        "mode": "library",
        "resolution": 480,
        "fps": 16,
        "library": {"base_clip_count": 1, "base_clip_seconds": 4},
    }
    engine = VideoEngine(VideoConfig.from_dict(settings))
    engine.load_models()
    engine.set_avatar(avatar_path)

    tone = synth_tone(seconds=3.0, sample_rate=24000, freq=220.0)

    log.info("Generating library-mode speaking clip (3s audio)...")
    clip = engine.generate_speaking_clip(
        audio_f32=tone,
        sample_rate=24000,
        reply_text="Hello, this is a lip-sync test.",
    )

    assert isinstance(clip, bytes) and len(clip) > 0
    assert clip[4:8] == b"ftyp"

    saved_to = write_bytes("phase5_speaking.mp4", clip)
    log.info("PASS: speaking clip written to %s (%d bytes)", saved_to, len(clip))


if __name__ == "__main__":
    run()
+69
View File
@@ -0,0 +1,69 @@
"""Phase 6 component test: reflective mode (fresh Wan2.2 clip per turn).
Verifies that with ``mode=reflective``, ``generate_speaking_clip`` runs
the Wan2.2 image-to-video pipeline once per call (so the base frames
differ from turn to turn) and the prompt is derived from the reply text.
Run:
docker compose exec voice-chat python -m tests.component.test_06_reflective
"""
from __future__ import annotations
import numpy as np
from server.video import VideoConfig, VideoEngine
from tests.component._common import (
ensure_sample_avatar,
get_logger,
synth_tone,
write_bytes,
)
log = get_logger("test_06")


def run():
    """Generate two reflective-mode speaking clips and compare them.

    Builds a ``VideoEngine`` in reflective mode, loads models and sets the
    sample avatar. It first asserts that ``_derive_prompt`` keeps the word
    "beach" from the reply text, then generates two clips from different
    reply texts. Each clip must be non-empty ``bytes`` with the ``ftyp``
    marker at bytes 4-8 before it is written to ``_out/``. Byte-identical
    clips only produce a warning, not a failure.
    """
    avatar_path = ensure_sample_avatar()
    cfg = VideoConfig.from_dict(
        {
            "enabled": True,
            "mode": "reflective",
            "resolution": 480,
            "fps": 16,
            "reflective": {"clip_seconds": 3},
        }
    )
    engine = VideoEngine(cfg)
    engine.load_models()
    engine.set_avatar(avatar_path)

    beach_reply = "The assistant walks along a sunny beach watching seagulls."
    snow_reply = "Now the character stands in a snow-covered forest at dusk."

    # Verify prompt derivation includes the reply hint
    prompt = engine._derive_prompt(beach_reply)
    log.info("derived prompt: %s", prompt)
    assert "beach" in prompt, "reply_hint did not survive template interpolation"

    # One sample rate drives both the tone and the engine calls, so the two
    # cannot drift apart if synth_tone's default ever changes.
    sample_rate = 24000
    audio = synth_tone(seconds=3.0, sample_rate=sample_rate)

    log.info("Generating reflective speaking clip #1...")
    mp4_a = engine.generate_speaking_clip(audio, sample_rate, beach_reply)
    assert isinstance(mp4_a, bytes) and len(mp4_a) > 0, "clip #1 is empty"
    assert mp4_a[4:8] == b"ftyp", "clip #1 is not a valid MP4"
    write_bytes("phase6_reflective_beach.mp4", mp4_a)

    log.info("Generating reflective speaking clip #2...")
    mp4_b = engine.generate_speaking_clip(audio, sample_rate, snow_reply)
    assert isinstance(mp4_b, bytes) and len(mp4_b) > 0, "clip #2 is empty"
    assert mp4_b[4:8] == b"ftyp", "clip #2 is not a valid MP4"
    write_bytes("phase6_reflective_snow.mp4", mp4_b)

    # Not a strict assertion (same prompt could yield identical bytes if seeded),
    # but with different prompts and random seeds the blobs should differ.
    if mp4_a != mp4_b:
        log.info("PASS: reflective clips differ as expected")
    else:
        log.warning("clips are byte-identical — check that seeds are random")


if __name__ == "__main__":
    run()
+114
View File
@@ -0,0 +1,114 @@
"""Phase 7 component test: HTTP endpoints (/api/set-avatar, /api/idle-clip,
/api/set-video-mode, /api/reload-loras, WebSocket handshake video_mode msg).
Uses FastAPI's ``TestClient`` so we don't need a running uvicorn server.
Stubs the model manager to avoid loading Wan2.2 — we only care that the
HTTP surface is plumbed correctly.
Run:
docker compose exec voice-chat python -m tests.component.test_07_endpoints
"""
from __future__ import annotations
import io
import json
import sys
from tests.component._common import get_logger
log = get_logger("test_07")
def _stub_video_engine():
    """Build a lightweight stand-in for the video engine.

    The returned object exposes ``cfg.mode``, ``avatar_path``, ``is_ready``,
    ``get_idle_clip``, ``set_avatar`` and ``load_loras``; none of them touch
    a real model.
    """

    class StubCfg:
        mode = "reflective"

    class StubEngine:
        # Class-level defaults; set_avatar shadows avatar_path per instance.
        cfg = StubCfg()
        avatar_path = None

        def __init__(self):
            self.idle = b"FAKE_MP4"

        def is_ready(self):
            """Ready once an avatar path has been set."""
            return bool(self.avatar_path)

        def get_idle_clip(self):
            """Return the canned idle blob."""
            return self.idle

        def set_avatar(self, path):
            """Remember the avatar path."""
            self.avatar_path = path

        def load_loras(self, specs):
            """Remember the most recent LoRA specs for inspection."""
            self._last_loras = specs

    stub = StubEngine()
    return stub
def run():
    """Exercise the video HTTP endpoints and WebSocket handshake with a stub.

    A stub engine replaces ``server.main.model_mgr.video_engine`` and the
    app's lifespan context is cleared before a ``TestClient`` is created.
    Five cases run in order: set-avatar, idle-clip, set-video-mode (three
    valid modes plus one rejected value), reload-loras, and a WebSocket
    connection that must yield a ``video_mode`` message within five
    received messages.
    """
    from fastapi.testclient import TestClient

    import server.main as main_mod

    # Inject a stub engine so we never touch Wan2.2.
    main_mod.model_mgr.video_engine = _stub_video_engine()

    # Bypass the heavy lifespan (model loading) so TestClient starts fast.
    main_mod.app.router.lifespan_context = None  # type: ignore[attr-defined]

    client = TestClient(main_mod.app)

    # --- set-avatar ---
    log.info("[case 1] POST /api/set-avatar")
    fake_png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64  # minimal PNG header
    upload = {"image": ("avatar.png", io.BytesIO(fake_png), "image/png")}
    response = client.post("/api/set-avatar", files=upload)
    assert response.status_code == 200, f"got {response.status_code}: {response.text}"
    payload = response.json()
    assert payload["status"] == "ok"
    assert payload["idle_clip_url"] == "/api/idle-clip"
    log.info(" PASS: %s", payload)

    # --- idle-clip ---
    log.info("[case 2] GET /api/idle-clip")
    response = client.get("/api/idle-clip")
    assert response.status_code == 200
    assert response.content == b"FAKE_MP4"
    assert response.headers["content-type"] == "video/mp4"
    log.info(" PASS")

    # --- set-video-mode ---
    log.info("[case 3] POST /api/set-video-mode")
    for mode in ("off", "library", "reflective"):
        response = client.post("/api/set-video-mode", data={"mode": mode})
        assert response.status_code == 200
        assert response.json()["mode"] == mode
    # An unknown mode must be rejected.
    response = client.post("/api/set-video-mode", data={"mode": "bogus"})
    assert response.status_code == 400
    log.info(" PASS")

    # --- reload-loras ---
    log.info("[case 4] POST /api/reload-loras")
    first_lora = {"path": "/cache/loras/a.safetensors", "weight": 0.8,
                  "target": "high_noise", "name": "test-a"}
    second_lora = {"path": "/cache/loras/b.safetensors", "weight": 0.4,
                   "target": "low_noise"}
    lora_request = {"loras": [first_lora, second_lora]}
    response = client.post("/api/reload-loras", json=lora_request)
    assert response.status_code == 200, response.text
    payload = response.json()
    assert payload["lora_count"] == 2
    log.info(" PASS: %s", payload)

    # --- WebSocket video_mode handshake ---
    log.info("[case 5] WebSocket /ws/chat → video_mode announcement")
    with client.websocket_connect("/ws/chat") as websocket:
        received = []
        # Read at most five messages; any receive/parse error ends the wait.
        for _ in range(5):
            try:
                message = websocket.receive_json()
                received.append(message)
                if message.get("type") == "video_mode":
                    break
            except Exception:
                break
        assert any(m.get("type") == "video_mode" for m in received), received
    log.info(" PASS")

    log.info("ALL PASSED")


if __name__ == "__main__":
    run()
+60
View File
@@ -0,0 +1,60 @@
"""Phase 8 component test: /api/reload-loras hot-swap.
Verifies that ``VideoEngine.load_loras`` can be called again after startup
and the idle clip is regenerated to reflect the new style.
This test is the 'real model' version of test_07's reload endpoint stub.
Run:
docker compose exec voice-chat python -m tests.component.test_08_lora_reload
"""
from __future__ import annotations
import hashlib
from server.video import LoRASpec, VideoConfig, VideoEngine
from tests.component._common import ensure_sample_avatar, get_logger, write_bytes
log = get_logger("test_08")


def run():
    """Compare the idle clip before and after loading a LoRA.

    Builds a ``VideoEngine`` in reflective mode, loads models and captures
    the idle clip for the sample avatar. It then calls ``load_loras`` with
    one high-noise distill LoRA, sets the avatar again and captures the
    idle clip a second time. Both clips are written to ``_out/`` and their
    SHA-256 digests compared; identical digests only produce a warning.
    """
    avatar_path = ensure_sample_avatar()
    engine = VideoEngine(VideoConfig.from_dict({"enabled": True, "mode": "reflective"}))
    engine.load_models()

    # Initial state: no LoRAs
    engine.set_avatar(avatar_path)
    baseline_clip = engine.get_idle_clip()
    assert baseline_clip is not None
    baseline_digest = hashlib.sha256(baseline_clip).hexdigest()
    write_bytes("phase8_idle_noloras.mp4", baseline_clip)
    log.info("idle (no LoRAs) sha256=%s", baseline_digest[:16])

    # Hot-reload with a distill LoRA
    distill_high = LoRASpec(
        path="lightx2v/Wan2.2-Distill-Loras:"
             "wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step.safetensors",
        weight=1.0,
        target="high_noise",
        name="distill-hi",
    )
    engine.load_loras([distill_high])
    engine.set_avatar(avatar_path)
    styled_clip = engine.get_idle_clip()
    assert styled_clip is not None
    styled_digest = hashlib.sha256(styled_clip).hexdigest()
    write_bytes("phase8_idle_withlora.mp4", styled_clip)
    log.info("idle (with LoRA) sha256=%s", styled_digest[:16])

    if baseline_digest == styled_digest:
        log.warning("clips identical — LoRA may not be applied; eyeball _out/*.mp4")
    else:
        log.info("PASS: idle clip changed after LoRA reload")


if __name__ == "__main__":
    run()