first stab at adding video

This commit is contained in:
2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
+47
View File
@@ -0,0 +1,47 @@
# Voice-chat tests
Two tiers.
## Unit tests — fast, GPU-free
```
python -m pytest tests/unit -v
```
These exercise pure logic: config parsing, prompt derivation, LoRA spec
parsing, frame-length fitting, and library round-robin selection. They do not
touch CUDA, Wan2.2, or MuseTalk. The muxer tests are the one exception that
needs `ffmpeg`; they are skipped automatically when it is not on PATH. The
suite is safe to run on Windows, outside Docker, without any models installed.
## Component tests — slow, GPU-required, run inside Docker
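Each script in `tests/component/` exercises one subsystem end-to-end, normally
against the real models (`test_01` loads no models and `test_07` stubs the
video engine, so those two also run without a GPU). The scripts are ordered to
match the implementation phases: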
| Script | Phase | Tests |
|---|---|---|
| `test_01_video_skeleton.py` | 1 | VideoEngine loads, config gate respected |
| `test_02_wan22_loras.py` | 2 | Wan2.2 pipeline loads, LoRA stack applies |
| `test_03_idle_clip.py` | 3 | set_avatar → idle MP4, written to disk for eyeballing |
| `test_04_library_prebake.py` | 4 | library mode pre-bakes N base clips |
| `test_05_musetalk_lipsync.py` | 5 | MuseTalk lip-sync on library frames + ffmpeg mux |
| `test_06_reflective.py` | 6 | reflective mode: fresh Wan2.2 per reply |
| `test_07_endpoints.py` | 7 | HTTP endpoints return sane responses |
| `test_08_lora_reload.py` | 8 | /api/reload-loras swaps LoRAs live |
Run one:
```
# Inside the container:
docker compose exec voice-chat python -m tests.component.test_03_idle_clip
```
Run all (slow: 20+ minutes on an RTX 5090):
```
docker compose exec voice-chat python -m tests.component.run_all
```
Each component script writes its artifacts (MP4s, PNG frame dumps, logs)
to `tests/component/_out/` so you can visually inspect results. That
directory is gitignored.
View File
View File
+72
View File
@@ -0,0 +1,72 @@
"""Shared utilities for component tests.
Component tests run inside the Docker image against real GPU models. They
write their output artefacts (MP4s, PNGs, logs) to ``_out/`` so you can
visually inspect results.
"""
from __future__ import annotations
import logging
import os
import sys
import numpy as np
OUT_DIR = os.path.join(os.path.dirname(__file__), "_out")
os.makedirs(OUT_DIR, exist_ok=True)
# A tiny 256x256 portrait PNG lives next to the component tests so tests
# don't need a user-supplied file. If it's missing we synthesise one on
# the fly.
SAMPLE_AVATAR = os.path.join(os.path.dirname(__file__), "sample_avatar.png")
def get_logger(name: str) -> logging.Logger:
    """Return the logger called *name* after configuring root logging.

    ``logging.basicConfig`` is called on every invocation with INFO level, a
    timestamped ``name level message`` format and ``sys.stdout`` as the
    stream.
    """
    root_settings = {
        "level": logging.INFO,
        "format": "%(asctime)s %(name)s %(levelname)s %(message)s",
        "stream": sys.stdout,
    }
    logging.basicConfig(**root_settings)
    named_logger = logging.getLogger(name)
    return named_logger
def _gradient_pixels() -> np.ndarray:
    """Build the 256x256 RGB fallback image as a uint8 array.

    Per row ``y``: red is ``y``, green is ``255 - y``, blue is a constant 128.
    """
    # Column vector so each assignment broadcasts one value across a whole row.
    rows = np.arange(256, dtype=np.uint8)[:, None]
    pixels = np.empty((256, 256, 3), dtype=np.uint8)
    pixels[:, :, 0] = rows
    pixels[:, :, 1] = 255 - rows
    pixels[:, :, 2] = 128
    return pixels


def ensure_sample_avatar() -> str:
    """Guarantee a usable avatar image exists. Returns its path.

    If ``SAMPLE_AVATAR`` is already a file it is returned untouched.
    Otherwise a simple gradient PNG is synthesised as a last resort (it won't
    look like a person, but it gives the tests a valid image file) and written
    with Pillow, or with ``imageio.v3`` when Pillow cannot be imported.

    Raises:
        ImportError: if neither Pillow nor imageio is importable.
    """
    if os.path.isfile(SAMPLE_AVATAR):
        return SAMPLE_AVATAR

    pixels = _gradient_pixels()
    try:
        from PIL import Image  # type: ignore[import-not-found]
    except ImportError:
        import imageio.v3 as iio  # type: ignore[import-not-found]

        iio.imwrite(SAMPLE_AVATAR, pixels)
    else:
        Image.fromarray(pixels).save(SAMPLE_AVATAR)
    return SAMPLE_AVATAR
def write_bytes(name: str, data: bytes) -> str:
    """Store *data* as ``_out/<name>`` and hand back the full path written."""
    target = os.path.join(OUT_DIR, name)
    with open(target, "wb") as handle:
        handle.write(data)
    return target
def synth_tone(seconds: float, sample_rate: int = 24000, freq: float = 220.0) -> np.ndarray:
    """Return a float32 sine tone usable as stand-in TTS audio.

    The result has ``int(seconds * sample_rate)`` samples and a peak
    amplitude of 0.2.
    """
    sample_count = int(seconds * sample_rate)
    timeline = np.arange(sample_count, dtype=np.float32) / sample_rate
    angular_freq = 2 * np.pi * freq
    wave = 0.2 * np.sin(angular_freq * timeline)
    return wave.astype(np.float32)
+46
View File
@@ -0,0 +1,46 @@
"""Run every component test in order. Stops at first failure.
docker compose exec voice-chat python -m tests.component.run_all
"""
import importlib
import sys
import traceback
# Dotted module names of the component tests, in the order main() runs them.
# Each module is expected to expose a zero-argument ``run()`` callable.
SCRIPTS = [
"tests.component.test_01_video_skeleton",
"tests.component.test_02_wan22_loras",
"tests.component.test_03_idle_clip",
"tests.component.test_04_library_prebake",
"tests.component.test_05_musetalk_lipsync",
"tests.component.test_06_reflective",
"tests.component.test_07_endpoints",
"tests.component.test_08_lora_reload",
]
def main() -> int:
    """Import and run every module in ``SCRIPTS``; stop at the first failure.

    A ``SystemExit`` with a falsy code is treated as a clean finish (the
    script skipped itself) and the loop moves on. A truthy exit code or any
    other exception records the script as failed and ends the run.

    Returns:
        0 when nothing failed, 1 otherwise.
    """
    banner = "=" * 70
    broken: list[str] = []
    for script in SCRIPTS:
        print(f"\n{banner}\nRUNNING: {script}\n{banner}")
        try:
            importlib.import_module(script).run()
        except SystemExit as stop:
            if not stop.code:
                continue
            print(f"FAILED: {script} (exit {stop.code})")
            broken.append(script)
            break  # hard-stop on failure
        except Exception:
            traceback.print_exc()
            broken.append(script)
            break

    if not broken:
        print("\nALL COMPONENT TESTS PASSED")
        return 0
    print(f"\n{len(broken)} failed: {broken}")
    return 1
if __name__ == "__main__":
sys.exit(main())
+69
View File
@@ -0,0 +1,69 @@
"""Phase 1 component test: VideoEngine skeleton + config gate.
Verifies:
- ``ModelManager`` can be imported and constructed.
- When ``config.video.enabled=false``, ``_load_video`` skips and leaves
``video_engine=None`` (existing voice path unaffected).
- When ``config.video.enabled=true``, a ``VideoEngine`` instance is created
and ``is_ready()`` returns False (no models loaded yet).
Does NOT load Wan2.2 or MuseTalk — this test is safe to run on any machine
with the python deps installed (no GPU needed).
Run inside Docker:
docker compose exec voice-chat python -m tests.component.test_01_video_skeleton
"""
from __future__ import annotations
import sys
from server.models import ModelManager
from server.video import VideoConfig, VideoEngine
from tests.component._common import get_logger
log = get_logger("test_01")
def run():
    """Exercise ``ModelManager._load_video`` with video disabled, then enabled.

    For each case ``server.config.config`` is replaced by a patched mapping
    and put back in a ``finally`` block, so the original config object is
    restored even when an assertion fails.
    """
    # --- disabled path ---
    log.info("[case 1] config.video.enabled=False → engine skipped")
    disabled_mgr = ModelManager()
    # Monkey-patch the config module to simulate disabled
    import server.config as cfgmod

    original = cfgmod.config
    disabled_cfg = {"video": {"enabled": False}}
    disabled_cfg.update((key, value) for key, value in original.items() if key != "video")
    cfgmod.config = disabled_cfg
    try:
        disabled_mgr._load_video()
        assert disabled_mgr.video_engine is None, "video_engine should be None when disabled"
        log.info(" PASS: video_engine is None")
    finally:
        cfgmod.config = original

    # --- enabled path (no models loaded) ---
    log.info("[case 2] config.video.enabled=True → engine created, not ready")
    enabled_mgr = ModelManager()
    enabled_cfg = dict(original)
    enabled_cfg["video"] = {"enabled": True, "mode": "reflective", "loras": []}
    cfgmod.config = enabled_cfg
    try:
        enabled_mgr._load_video()
        assert enabled_mgr.video_engine is not None, "video_engine should be created"
        assert isinstance(enabled_mgr.video_engine, VideoEngine)
        assert enabled_mgr.video_engine.is_ready() is False
        log.info(
            " PASS: engine=%s, ready=%s",
            type(enabled_mgr.video_engine).__name__,
            enabled_mgr.video_engine.is_ready(),
        )
    finally:
        cfgmod.config = original

    log.info("ALL PASSED")
if __name__ == "__main__":
try:
run()
sys.exit(0)
except AssertionError as e:
log.error("FAILED: %s", e)
sys.exit(1)
+106
View File
@@ -0,0 +1,106 @@
"""Phase 2 component test: Wan2.2-Lightning fp8 pipeline + LoRA stacking.
Verifies:
- ``Wan22Pipeline`` loads successfully against the fp8 distill path
(exercises the real LightX2V set_config → init_runner flow).
- ``load_loras`` / ``unload_loras`` survive with the two user LoRAs at
``/cache/loras/wan22-[HL]-e8.safetensors``.
Requires GPU and a first-run download of both HF repos (base support files
~12 GB, fp8 DIT ~30 GB). If LightX2V isn't installed the test is skipped.
Run:
docker compose exec voice-chat python -m tests.component.test_02_wan22_loras
"""
from __future__ import annotations
import os
import sys
from tests.component._common import get_logger
log = get_logger("test_02")
CONFIG_JSON = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
LORA_HIGH = "/cache/loras/wan22-H-e8.safetensors"
LORA_LOW = "/cache/loras/wan22-L-e8.safetensors"
def run():
    """Load the Wan2.2 pipeline, then apply and remove the two user LoRAs.

    Exit codes (raised through ``sys.exit``):
        0: ``Wan22Pipeline`` could not be imported, treated as a skip.
        2: pipeline construction raised.
        3: ``load_loras`` raised for the two LoRA specs.
        4: ``unload_loras`` raised.

    Returns normally when all cases pass, or when the LoRA files are absent
    and the LoRA cases are skipped. Failures are logged with
    ``log.exception`` so the traceback is kept alongside the hint text.
    """
    try:
        from server.video_models.wan22 import Wan22Pipeline
    except ImportError as e:
        log.error("Wan22Pipeline import failed: %s", e)
        log.warning("SKIP: phase 2 deps not installed")
        sys.exit(0)
    from server.video import LoRASpec

    log.info("[case 1] Instantiate Wan22Pipeline "
             "(first run downloads ~42 GB total)...")
    try:
        pipe = Wan22Pipeline(
            base_repo="Wan-AI/Wan2.2-I2V-A14B",
            fp8_repo="lightx2v/Wan2.2-Distill-Models",
            config_json=CONFIG_JSON,
            model_cls="wan2.2_moe_distill",
            resolution=480,
            fps=16,
        )
    except Exception as e:
        # log.exception keeps the traceback; the message alone rarely says
        # which layer of the model stack failed.
        log.exception("FAIL: Wan22Pipeline construction raised: %s", e)
        log.error("Check: LightX2V install, HF cache at /cache/huggingface, "
                  "VRAM headroom, and that %s exists inside the container.",
                  CONFIG_JSON)
        sys.exit(2)
    log.info(" PASS: pipeline constructed")

    # --- LoRAs ---
    log.info("[case 2] load_loras with empty list → no-op")
    pipe.load_loras([])
    log.info(" PASS")

    if not (os.path.isfile(LORA_HIGH) and os.path.isfile(LORA_LOW)):
        log.warning("SKIP: expected LoRA files not found at %s / %s",
                    LORA_HIGH, LORA_LOW)
        log.info("ALL PASSED (partial — LoRA cases skipped)")
        return

    log.info("[case 3] load_loras with the two MoE distill LoRAs")
    specs = [
        LoRASpec(
            path=LORA_HIGH,
            weight=1.0,
            target="high_noise",
            name="wan22-H-e8",
        ),
        LoRASpec(
            path=LORA_LOW,
            weight=1.0,
            target="low_noise",
            name="wan22-L-e8",
        ),
    ]
    try:
        pipe.load_loras(specs)
    except Exception as e:
        log.exception("FAIL: load_loras raised: %s", e)
        log.error("Check: switch_lora support for wan2.2_moe_distill in the "
                  "installed LightX2V build. If it errors there, pre-declare "
                  "LoRAs in the config_json 'lora_configs' field instead.")
        sys.exit(3)
    log.info(" PASS: LoRAs applied")

    log.info("[case 4] unload_loras")
    try:
        pipe.unload_loras()
    except Exception as e:
        log.exception("FAIL: unload_loras raised: %s", e)
        sys.exit(4)
    log.info(" PASS")

    log.info("ALL PASSED")
if __name__ == "__main__":
run()
+66
View File
@@ -0,0 +1,66 @@
"""Phase 3 component test: avatar upload → idle clip generation.
Verifies:
- ``VideoEngine.load_models()`` + ``set_avatar(image)`` produces a non-empty
idle MP4 blob.
- The blob decodes as a valid MP4 (ftyp header).
Writes the idle clip to ``tests/component/_out/phase3_idle.mp4`` so you can
inspect it visually.
Run:
docker compose exec voice-chat python -m tests.component.test_03_idle_clip
"""
from __future__ import annotations
import sys
from server.video import VideoConfig, VideoEngine
from tests.component._common import ensure_sample_avatar, get_logger, write_bytes
log = get_logger("test_03")
def run():
    """Load the models, set the sample avatar and validate the idle clip.

    Exit codes (raised through ``sys.exit``):
        2: ``load_models`` raised.
        3: ``set_avatar`` raised.

    Raises:
        AssertionError: if the idle clip is empty, lacks the ``ftyp`` marker
            at bytes 4-8, or the engine does not report ready afterwards.
    """
    avatar_path = ensure_sample_avatar()
    log.info("Using avatar: %s", avatar_path)

    cfg = VideoConfig.from_dict(
        {
            "enabled": True,
            "mode": "reflective",  # reflective skips the library prebake
            "resolution": 480,
            "fps": 16,
            "library": {"base_clip_count": 0, "base_clip_seconds": 3},
        }
    )
    engine = VideoEngine(cfg)

    log.info("Loading models (Wan2.2 + MuseTalk)...")
    try:
        engine.load_models()
    except Exception as e:
        # Keep the traceback: the message alone rarely pinpoints the failure.
        log.exception("FAIL: load_models raised: %s", e)
        sys.exit(2)
    log.info("Models loaded.")

    log.info("Generating idle clip for avatar...")
    try:
        engine.set_avatar(avatar_path)
    except Exception as e:
        log.exception("FAIL: set_avatar raised: %s", e)
        sys.exit(3)

    idle = engine.get_idle_clip()
    assert idle is not None and len(idle) > 0, "idle clip is empty"
    assert idle[4:8] == b"ftyp", "idle clip is not a valid MP4"
    out_path = write_bytes("phase3_idle.mp4", idle)

    # Run the last assertion before announcing PASS so the log never reports
    # success for a run that then fails.
    assert engine.is_ready() is True
    log.info("PASS: idle clip written to %s (%d bytes)", out_path, len(idle))
    log.info(" engine.is_ready() = True (avatar + models present)")
if __name__ == "__main__":
run()
@@ -0,0 +1,55 @@
"""Phase 4 component test: library mode pre-bake of speaking-base clips.
Verifies:
- ``set_avatar`` under ``mode=library`` populates ``speaking_base_frames``
with ``library_base_clip_count`` entries.
- Each cached entry has shape ``[T, H, W, 3]`` uint8.
Run:
docker compose exec voice-chat python -m tests.component.test_04_library_prebake
"""
from __future__ import annotations
import sys
import numpy as np
from server.video import VideoConfig, VideoEngine
from tests.component._common import ensure_sample_avatar, get_logger
log = get_logger("test_04")
def run():
    """Pre-bake two library clips and check the cached frame arrays.

    Raises:
        AssertionError: if the clip count is not 2, a cached entry is not a
            4-D uint8 ndarray with 3 channels last, or no idle clip exists.
    """
    avatar = ensure_sample_avatar()
    settings = {
        "enabled": True,
        "mode": "library",
        "resolution": 480,
        "fps": 16,
        "library": {"base_clip_count": 2, "base_clip_seconds": 3},
    }
    engine = VideoEngine(VideoConfig.from_dict(settings))

    log.info("Loading models...")
    engine.load_models()

    log.info("Pre-baking 2 library clips...")
    engine.set_avatar(avatar)

    assert len(engine.speaking_base_frames) == 2, \
        f"expected 2 base clips, got {len(engine.speaking_base_frames)}"

    for index, clip in enumerate(engine.speaking_base_frames):
        assert isinstance(clip, np.ndarray)
        is_rgb_stack = clip.ndim == 4 and clip.shape[-1] == 3
        assert is_rgb_stack
        assert clip.dtype == np.uint8
        log.info(" clip %d: shape=%s", index, clip.shape)

    assert engine.get_idle_clip() is not None
    log.info("PASS: library pre-bake complete")
if __name__ == "__main__":
run()
@@ -0,0 +1,57 @@
"""Phase 5 component test: MuseTalk lip-sync + ffmpeg mux.
Verifies the full library-mode per-turn path:
- Pre-bake a library clip.
- Generate a stand-in TTS waveform (sine tone).
- Call ``VideoEngine.generate_speaking_clip`` and get a valid MP4 back.
Writes the resulting clip to ``tests/component/_out/phase5_speaking.mp4``.
Run:
docker compose exec voice-chat python -m tests.component.test_05_musetalk_lipsync
"""
from __future__ import annotations
import sys
from server.video import VideoConfig, VideoEngine
from tests.component._common import (
ensure_sample_avatar,
get_logger,
synth_tone,
write_bytes,
)
log = get_logger("test_05")
def run():
    """Run one library-mode turn with a sine tone and validate the MP4.

    The resulting clip is written to ``_out/phase5_speaking.mp4``.

    Raises:
        AssertionError: if the result is not non-empty ``bytes`` carrying the
            ``ftyp`` marker at bytes 4-8.
    """
    avatar = ensure_sample_avatar()
    settings = {
        "enabled": True,
        "mode": "library",
        "resolution": 480,
        "fps": 16,
        "library": {"base_clip_count": 1, "base_clip_seconds": 4},
    }
    engine = VideoEngine(VideoConfig.from_dict(settings))
    engine.load_models()
    engine.set_avatar(avatar)

    tone = synth_tone(seconds=3.0, sample_rate=24000, freq=220.0)
    log.info("Generating library-mode speaking clip (3s audio)...")
    clip = engine.generate_speaking_clip(
        audio_f32=tone,
        sample_rate=24000,
        reply_text="Hello, this is a lip-sync test.",
    )

    assert isinstance(clip, bytes) and len(clip) > 0
    assert clip[4:8] == b"ftyp"
    saved_to = write_bytes("phase5_speaking.mp4", clip)
    log.info("PASS: speaking clip written to %s (%d bytes)", saved_to, len(clip))
if __name__ == "__main__":
run()
+69
View File
@@ -0,0 +1,69 @@
"""Phase 6 component test: reflective mode (fresh Wan2.2 clip per turn).
Verifies that with ``mode=reflective``, ``generate_speaking_clip`` runs
the Wan2.2 image-to-video pipeline once per call (so the base frames
differ from turn to turn) and the prompt is derived from the reply text.
Run:
docker compose exec voice-chat python -m tests.component.test_06_reflective
"""
from __future__ import annotations
import numpy as np
from server.video import VideoConfig, VideoEngine
from tests.component._common import (
ensure_sample_avatar,
get_logger,
synth_tone,
write_bytes,
)
log = get_logger("test_06")
def run():
    """Generate two reflective-mode clips from different replies.

    Checks that the derived prompt contains the reply hint and that both
    returned blobs are non-empty ``bytes`` with the ``ftyp`` marker at bytes
    4-8 (the same validity check the phase 3 and phase 5 tests use). The
    clips are written to ``_out/`` for visual inspection. Byte-identical
    clips only produce a warning, because identical seeds could legitimately
    yield identical output.

    Raises:
        AssertionError: if the prompt loses the reply hint or either clip is
            not a valid-looking MP4 blob.
    """
    avatar_path = ensure_sample_avatar()
    cfg = VideoConfig.from_dict(
        {
            "enabled": True,
            "mode": "reflective",
            "resolution": 480,
            "fps": 16,
            "reflective": {"clip_seconds": 3},
        }
    )
    engine = VideoEngine(cfg)
    engine.load_models()
    engine.set_avatar(avatar_path)

    beach_reply = "The assistant walks along a sunny beach watching seagulls."
    snow_reply = "Now the character stands in a snow-covered forest at dusk."

    # Verify prompt derivation includes the reply hint
    prompt = engine._derive_prompt(beach_reply)
    log.info("derived prompt: %s", prompt)
    assert "beach" in prompt, "reply_hint did not survive template interpolation"

    audio = synth_tone(seconds=3.0)

    log.info("Generating reflective speaking clip #1...")
    mp4_a = engine.generate_speaking_clip(audio, 24000, beach_reply)
    assert isinstance(mp4_a, bytes) and len(mp4_a) > 0, "clip #1 is empty"
    assert mp4_a[4:8] == b"ftyp", "clip #1 is not a valid MP4"
    write_bytes("phase6_reflective_beach.mp4", mp4_a)

    log.info("Generating reflective speaking clip #2...")
    mp4_b = engine.generate_speaking_clip(audio, 24000, snow_reply)
    assert isinstance(mp4_b, bytes) and len(mp4_b) > 0, "clip #2 is empty"
    assert mp4_b[4:8] == b"ftyp", "clip #2 is not a valid MP4"
    write_bytes("phase6_reflective_snow.mp4", mp4_b)

    # Not a strict assertion (same prompt could yield identical bytes if seeded),
    # but with different prompts and random seeds the blobs should differ.
    if mp4_a != mp4_b:
        log.info("PASS: reflective clips differ as expected")
    else:
        log.warning("clips are byte-identical — check that seeds are random")
if __name__ == "__main__":
run()
+114
View File
@@ -0,0 +1,114 @@
"""Phase 7 component test: HTTP endpoints (/api/set-avatar, /api/idle-clip,
/api/set-video-mode, /api/reload-loras, WebSocket handshake video_mode msg).
Uses FastAPI's ``TestClient`` so we don't need a running uvicorn server.
Stubs the model manager to avoid loading Wan2.2 — we only care that the
HTTP surface is plumbed correctly.
Run:
docker compose exec voice-chat python -m tests.component.test_07_endpoints
"""
from __future__ import annotations
import io
import json
import sys
from tests.component._common import get_logger
log = get_logger("test_07")
def _stub_video_engine():
    """Build a lightweight stand-in for the video engine.

    The stub reports ready once an avatar path has been set, serves a fixed
    ``b"FAKE_MP4"`` idle clip, and records the last LoRA specs it was given
    in ``_last_loras``.
    """

    class StubCfg:
        mode = "reflective"

    class StubEngine:
        cfg = StubCfg()
        avatar_path = None

        def __init__(self):
            self.idle = b"FAKE_MP4"

        def is_ready(self):
            return True if self.avatar_path else False

        def get_idle_clip(self):
            return self.idle

        def set_avatar(self, path):
            self.avatar_path = path

        def load_loras(self, specs):
            self._last_loras = specs

    stub = StubEngine()
    return stub
def run():
    """Drive the video HTTP/WebSocket surface through FastAPI's TestClient.

    A stub engine replaces ``server.main.model_mgr.video_engine`` and the
    app's lifespan context is cleared so no real models are loaded. Both
    patches are undone in a ``finally`` block: ``run_all`` executes every
    component script in one process, so leaving them in place would leak the
    stub into whatever imports ``server.main`` next.

    Raises:
        AssertionError: if any endpoint returns an unexpected status/payload
            or no ``video_mode`` message arrives on the WebSocket.
    """
    from fastapi.testclient import TestClient
    import server.main as main_mod

    router = main_mod.app.router
    saved_engine = getattr(main_mod.model_mgr, "video_engine", None)
    saved_lifespan = getattr(router, "lifespan_context", None)

    # Inject a stub engine so we never touch Wan2.2.
    main_mod.model_mgr.video_engine = _stub_video_engine()
    # Bypass the heavy lifespan (model loading) so TestClient starts fast.
    router.lifespan_context = None  # type: ignore[attr-defined]

    try:
        client = TestClient(main_mod.app)

        # --- set-avatar ---
        log.info("[case 1] POST /api/set-avatar")
        fake_png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 64  # minimal PNG header
        resp = client.post(
            "/api/set-avatar",
            files={"image": ("avatar.png", io.BytesIO(fake_png), "image/png")},
        )
        assert resp.status_code == 200, f"got {resp.status_code}: {resp.text}"
        data = resp.json()
        assert data["status"] == "ok"
        assert data["idle_clip_url"] == "/api/idle-clip"
        log.info(" PASS: %s", data)

        # --- idle-clip ---
        log.info("[case 2] GET /api/idle-clip")
        resp = client.get("/api/idle-clip")
        assert resp.status_code == 200
        assert resp.content == b"FAKE_MP4"
        assert resp.headers["content-type"] == "video/mp4"
        log.info(" PASS")

        # --- set-video-mode ---
        log.info("[case 3] POST /api/set-video-mode")
        for mode in ("off", "library", "reflective"):
            resp = client.post("/api/set-video-mode", data={"mode": mode})
            assert resp.status_code == 200
            assert resp.json()["mode"] == mode
        resp = client.post("/api/set-video-mode", data={"mode": "bogus"})
        assert resp.status_code == 400
        log.info(" PASS")

        # --- reload-loras ---
        log.info("[case 4] POST /api/reload-loras")
        body = {
            "loras": [
                {"path": "/cache/loras/a.safetensors", "weight": 0.8,
                 "target": "high_noise", "name": "test-a"},
                {"path": "/cache/loras/b.safetensors", "weight": 0.4,
                 "target": "low_noise"},
            ]
        }
        resp = client.post("/api/reload-loras", json=body)
        assert resp.status_code == 200, resp.text
        data = resp.json()
        assert data["lora_count"] == 2
        log.info(" PASS: %s", data)

        # --- WebSocket video_mode handshake ---
        log.info("[case 5] WebSocket /ws/chat → video_mode announcement")
        with client.websocket_connect("/ws/chat") as websocket:
            msgs = []
            for _ in range(5):
                try:
                    msg = websocket.receive_json()
                    msgs.append(msg)
                    if msg.get("type") == "video_mode":
                        break
                except Exception:
                    # Deliberate best-effort: a closed socket or non-JSON
                    # frame ends the scan; the assertion below reports it.
                    break
            assert any(m.get("type") == "video_mode" for m in msgs), msgs
        log.info(" PASS")
    finally:
        # Undo the module-level patches whether or not the cases passed.
        main_mod.model_mgr.video_engine = saved_engine
        router.lifespan_context = saved_lifespan

    log.info("ALL PASSED")
if __name__ == "__main__":
run()
+60
View File
@@ -0,0 +1,60 @@
"""Phase 8 component test: /api/reload-loras hot-swap.
Verifies that ``VideoEngine.load_loras`` can be called again after startup
and the idle clip is regenerated to reflect the new style.
This test is the 'real model' version of test_07's reload endpoint stub.
Run:
docker compose exec voice-chat python -m tests.component.test_08_lora_reload
"""
from __future__ import annotations
import hashlib
from server.video import LoRASpec, VideoConfig, VideoEngine
from tests.component._common import ensure_sample_avatar, get_logger, write_bytes
log = get_logger("test_08")
def run():
    """Compare the idle clip before and after hot-loading a distill LoRA.

    Both clips are written to ``_out/`` and their SHA-256 digests logged.
    Identical digests only trigger a warning, not a failure.

    Raises:
        AssertionError: if either idle clip is ``None``.
    """
    avatar = ensure_sample_avatar()
    engine = VideoEngine(VideoConfig.from_dict({"enabled": True, "mode": "reflective"}))
    engine.load_models()

    # Initial state: no LoRAs
    engine.set_avatar(avatar)
    plain_clip = engine.get_idle_clip()
    assert plain_clip is not None
    plain_digest = hashlib.sha256(plain_clip).hexdigest()
    write_bytes("phase8_idle_noloras.mp4", plain_clip)
    log.info("idle (no LoRAs) sha256=%s", plain_digest[:16])

    # Hot-reload with a distill LoRA
    distill_spec = LoRASpec(
        path="lightx2v/Wan2.2-Distill-Loras:"
             "wan2.2_i2v_A14b_high_noise_lora_rank64_lightx2v_4step.safetensors",
        weight=1.0,
        target="high_noise",
        name="distill-hi",
    )
    engine.load_loras([distill_spec])
    engine.set_avatar(avatar)
    styled_clip = engine.get_idle_clip()
    assert styled_clip is not None
    styled_digest = hashlib.sha256(styled_clip).hexdigest()
    write_bytes("phase8_idle_withlora.mp4", styled_clip)
    log.info("idle (with LoRA) sha256=%s", styled_digest[:16])

    if plain_digest == styled_digest:
        log.warning("clips identical — LoRA may not be applied; eyeball _out/*.mp4")
    else:
        log.info("PASS: idle clip changed after LoRA reload")
if __name__ == "__main__":
run()
View File
+65
View File
@@ -0,0 +1,65 @@
"""Unit tests for the frame-length fitting helper in server.video_models.musetalk.
Pure-python: does not import MuseTalk itself.
"""
import numpy as np
from server.video_models.musetalk import _fit_frames_to_length, _ensure_uint8_rgb
def _make_frames(t, h=2, w=2):
    """Return ``t`` frames of shape ``(h, w, 3)`` filled with a running uint8 counter."""
    element_count = t * h * w * 3
    counter = np.arange(element_count, dtype=np.uint8)
    return counter.reshape(t, h, w, 3)
def test_fit_frames_trim():
    """A target shorter than the clip keeps only the leading frames."""
    source = _make_frames(10)
    trimmed = _fit_frames_to_length(source, 4)
    expected_shape = (4, 2, 2, 3)
    assert trimmed.shape == expected_shape
    np.testing.assert_array_equal(trimmed, source[:4])
def test_fit_frames_passthrough_when_equal():
    """A target equal to the clip length yields the same frames (or the same object)."""
    clip = _make_frames(5)
    result = _fit_frames_to_length(clip, 5)
    same_object = result is clip
    assert same_object or np.array_equal(result, clip)
def test_fit_frames_extends_with_pingpong():
    """A longer target is filled forward, then reversed, then forward again."""
    clip = _make_frames(3)
    extended = _fit_frames_to_length(clip, 8)
    assert extended.shape == (8, 2, 2, 3)

    forward_pass = extended[:3]
    backward_pass = extended[3:6]
    second_forward = extended[6:8]
    # First 3 frames match the original
    np.testing.assert_array_equal(forward_pass, clip)
    # Next 3 are the reverse (ping-pong)
    np.testing.assert_array_equal(backward_pass, clip[::-1])
    # Then forward again
    np.testing.assert_array_equal(second_forward, clip[:2])
def test_fit_frames_zero_target_returns_original():
    """A target length of zero leaves the frames unchanged."""
    clip = _make_frames(3)
    result = _fit_frames_to_length(clip, 0)
    np.testing.assert_array_equal(result, clip)
def test_ensure_uint8_rgb_from_float():
    """Float input of 0.5 is expected to come back as uint8 value 127, shape kept."""
    shape = (5, 2, 2, 3)
    half_grey = np.ones(shape, dtype=np.float32) * 0.5
    converted = _ensure_uint8_rgb(half_grey)
    assert converted.dtype == np.uint8
    assert converted.shape == shape
    assert converted[0, 0, 0, 0] == 127
def test_ensure_uint8_rgb_promotes_3d_to_4d():
    """A single ``(H, W, 3)`` image gains a leading axis of length one."""
    single_image = np.zeros((2, 2, 3), dtype=np.uint8)
    promoted = _ensure_uint8_rgb(single_image)
    assert promoted.shape == (1, 2, 2, 3)
def test_ensure_uint8_rgb_clips_float_out_of_range():
    """Float values above/below the expected range saturate at 255 and 0."""
    pixel_shape = (1, 1, 1, 3)

    too_bright = np.ones(pixel_shape, dtype=np.float32) * 2.0  # 2.0 → clipped to 255
    bright_out = _ensure_uint8_rgb(too_bright)
    assert bright_out[0, 0, 0, 0] == 255

    too_dark = np.ones(pixel_shape, dtype=np.float32) * -1.0
    dark_out = _ensure_uint8_rgb(too_dark)
    assert dark_out[0, 0, 0, 0] == 0
+67
View File
@@ -0,0 +1,67 @@
"""Unit tests for the ffmpeg muxer.
Requires ``ffmpeg`` on PATH. On Windows, if ffmpeg is not installed these
tests are skipped (they will run inside the Docker image where ffmpeg is
always present).
"""
import os
import shutil
import struct
import numpy as np
import pytest
from server.video_models.muxer import frames_and_audio_to_mp4, frames_to_mp4_loop
pytestmark = pytest.mark.skipif(
shutil.which("ffmpeg") is None,
reason="ffmpeg not installed locally; run these inside Docker",
)
def _rgb_frames(t, h=64, w=64):
    """Return ``t`` uint8 RGB frames with real content for the encoder.

    Red steps by 20 per frame (modulo 255), the top half is full green and
    the left half is full blue.
    """
    clip = np.zeros((t, h, w, 3), dtype=np.uint8)
    red_levels = (np.arange(t) * 20) % 255
    clip[:, :, :, 0] = red_levels[:, None, None]
    clip[:, : h // 2, :, 1] = 255
    clip[:, :, : w // 2, 2] = 255
    return clip
def test_frames_to_mp4_loop_produces_mp4_bytes():
    """Encoding a short clip returns non-empty bytes carrying an ``ftyp`` box."""
    blob = frames_to_mp4_loop(_rgb_frames(8), fps=16)
    assert isinstance(blob, bytes)
    assert len(blob) > 0
    # MP4 files start with an ftyp box: 4 bytes size + 'ftyp'
    box_type = blob[4:8]
    assert box_type == b"ftyp"
def test_frames_and_audio_to_mp4_produces_mp4_bytes():
    """Muxing frames with one second of silence returns a non-empty MP4 blob."""
    video = _rgb_frames(16)
    # 1s silent audio at 24kHz
    silence = np.zeros(24000, dtype=np.float32)
    blob = frames_and_audio_to_mp4(video, silence, sample_rate=24000, fps=16)
    assert isinstance(blob, bytes)
    assert len(blob) > 0
    box_type = blob[4:8]
    assert box_type == b"ftyp"
def test_frames_to_mp4_loop_rejects_empty():
    """A zero-frame clip must raise ``ValueError``."""
    no_frames = np.empty((0, 64, 64, 3), dtype=np.uint8)
    with pytest.raises(ValueError):
        frames_to_mp4_loop(no_frames, fps=16)
def test_frames_and_audio_to_mp4_rejects_empty_audio():
    """An empty audio buffer must raise ``ValueError``."""
    video = _rgb_frames(4)
    no_audio = np.empty(0, dtype=np.float32)
    with pytest.raises(ValueError):
        frames_and_audio_to_mp4(video, no_audio, sample_rate=24000, fps=16)
def test_frames_to_mp4_loop_rejects_wrong_shape():
    """A 3-D array with no channel axis must raise ``ValueError``."""
    channelless = np.zeros((4, 64, 64), dtype=np.uint8)
    with pytest.raises(ValueError):
        frames_to_mp4_loop(channelless, fps=16)
+144
View File
@@ -0,0 +1,144 @@
"""Unit test for the video-mode branch in ConversationSession.
Stubs every model involved (ASR, LLM, TTS, VideoEngine) so we can verify:
1. When video_engine is not ready, the existing PCM streaming path runs.
2. When video_engine IS ready, the per-chunk PCM sends are skipped and a
single ``speaking_clip`` JSON + MP4 binary is sent instead.
Pure asyncio; no CUDA, no real models.
"""
from __future__ import annotations
import asyncio
import types
from unittest.mock import MagicMock
import numpy as np
import pytest
from server.pipeline import ConversationSession
class _FakeVAD:
    """VAD stand-in that never reports speech and never emits an utterance."""

    is_speaking = False

    def process_chunk(self, _):
        """Ignore the chunk; always return ``None``."""
        return None
class _FakeASR:
    """ASR stand-in that returns a fixed transcript for any audio."""

    def __init__(self, text="hello"):
        self.text = text

    def transcribe(self, _):
        """Return the canned transcript, ignoring the audio argument."""
        return self.text
class _FakeLLM:
    """LLM stand-in that answers every prompt with one fixed response."""

    def __init__(self, response="Hi there."):
        self.response = response

    def generate(self, *_a, **_k):
        """Return ``(response, None)`` regardless of the arguments."""
        reply = self.response
        return reply, None

    def trim_cache(self, state, _):
        """Hand the given state back unchanged."""
        return state
class _FakeTTSIterable:
    """Drop-in replacement for Kokoro's pipeline(..) generator."""

    def __init__(self, chunks):
        self._chunks = chunks

    def __call__(self, segment, voice=None):
        """Yield ``("w<i>", None, audio)`` for each stored chunk, in order."""
        index = 0
        for audio in self._chunks:
            yield f"w{index}", None, audio
            index += 1
class _FakeTTSEngine:
    """TTS engine stand-in exposing ``pipeline``, ``voice`` and ``sample_rate``."""

    def __init__(self, chunks):
        self.sample_rate = 24000
        self.voice = "v"
        self.pipeline = _FakeTTSIterable(chunks)
class _FakeVideoEngineReady:
    """Video engine stand-in that is always ready and records its last call."""

    class _Cfg:
        mode = "reflective"

    cfg = _Cfg()

    def __init__(self):
        self.called_with = None

    def is_ready(self):
        """Always report ready."""
        return True

    def generate_speaking_clip(self, audio, sr, reply_text):
        """Record the audio length, sample rate and reply; return fake MP4 bytes."""
        call_record = {"len": len(audio), "sr": sr, "reply": reply_text}
        self.called_with = call_record
        return b"FAKE_MP4_BYTES"
class _FakeModelsBase:
    """Model-manager stand-in wiring together the fake ASR, LLM and TTS engines."""

    def __init__(self, tts_chunks):
        self.tts_engine = _FakeTTSEngine(tts_chunks)
        self.llm_engine = _FakeLLM()
        self.asr_engine = _FakeASR()

    def create_vad(self):
        """Return a fresh fake VAD."""
        return _FakeVAD()
class _FakeModelsStreaming(_FakeModelsBase):
    """Fake models with no video engine attached."""

    video_engine = None
class _FakeModelsVideo(_FakeModelsBase):
    """Fake models carrying an always-ready fake video engine."""

    def __init__(self, tts_chunks):
        super().__init__(tts_chunks)
        ready_engine = _FakeVideoEngineReady()
        self.video_engine = ready_engine
@pytest.mark.asyncio
async def test_streaming_path_when_video_engine_absent():
    """With no video engine, audio is streamed per TTS chunk and no clip envelope is sent."""
    sent_json: list = []
    sent_bytes: list = []

    async def send_json(d):
        sent_json.append(d)

    async def send_bytes(b):
        sent_bytes.append(b)

    tts_chunks = [np.ones(length, dtype=np.float32) for length in (240, 480)]
    models = _FakeModelsStreaming(tts_chunks=tts_chunks)
    session = ConversationSession(models, send_json, send_bytes)
    utterance = np.zeros(16000, dtype=np.float32)
    await session._process_utterance(utterance)

    # PCM bytes were sent (one per TTS chunk).
    assert len(sent_bytes) == 2
    # Per-chunk response_text messages were sent (not video's one-shot).
    text_msgs = [msg for msg in sent_json if msg.get("type") == "response_text"]
    assert any(not msg.get("final") for msg in text_msgs)
    # No speaking_clip envelope
    assert all(msg.get("type") != "speaking_clip" for msg in sent_json)
@pytest.mark.asyncio
async def test_video_path_when_engine_ready():
    """With a ready video engine, one ``speaking_clip`` envelope plus one MP4 blob is sent."""
    sent_json: list = []
    sent_bytes: list = []

    async def send_json(d):
        sent_json.append(d)

    async def send_bytes(b):
        sent_bytes.append(b)

    tts_chunks = [
        np.full(480, level, dtype=np.float32)
        for level in (0.5, 0.25)
    ]
    models = _FakeModelsVideo(tts_chunks=tts_chunks)
    session = ConversationSession(models, send_json, send_bytes)
    utterance = np.zeros(16000, dtype=np.float32)
    await session._process_utterance(utterance)

    # MP4 blob was sent once.
    assert sent_bytes == [b"FAKE_MP4_BYTES"]

    # speaking_clip envelope was sent exactly once.
    envelopes = [msg for msg in sent_json if msg.get("type") == "speaking_clip"]
    assert len(envelopes) == 1
    envelope = envelopes[0]
    assert envelope["size_bytes"] == len(b"FAKE_MP4_BYTES")
    assert envelope["text"] == "Hi there."

    # The video engine received the concatenated audio.
    recorded = models.video_engine
    assert recorded.called_with is not None
    assert recorded.called_with["len"] == 960  # 480 + 480
    assert recorded.called_with["reply"] == "Hi there."

    # No per-chunk PCM bytes were streamed (video path suppresses them).
    # Only the MP4 blob is in bytes_sent.
    assert len(sent_bytes) == 1
+119
View File
@@ -0,0 +1,119 @@
"""Unit tests for VideoConfig parsing and LoRASpec validation.
Pure-python, no model imports, no CUDA, no ffmpeg. Safe for Windows CI.
"""
import pytest
from server.video import VideoConfig, LoRASpec
def test_defaults_when_raw_is_empty():
    """An empty dict yields the documented default configuration."""
    defaults = VideoConfig.from_dict({})
    assert defaults.enabled is False
    expected_values = {
        "backend": "lightx2v",
        "mode": "reflective",
        "resolution": 480,
        "fps": 16,
        "library_base_clip_count": 4,
        "reflective_prompt_reply_words": 18,
        "loras": [],
    }
    for field_name, expected in expected_values.items():
        assert getattr(defaults, field_name) == expected
def test_defaults_when_raw_is_none():
    """``None`` is accepted in place of a dict and leaves video disabled."""
    parsed = VideoConfig.from_dict(None)  # type: ignore[arg-type]
    assert parsed.enabled is False
def test_library_section_override():
    """Values under ``library`` land on the ``library_*`` config fields."""
    raw = {
        "enabled": True,
        "mode": "library",
        "library": {"base_clip_count": 7, "base_clip_seconds": 3},
    }
    parsed = VideoConfig.from_dict(raw)
    assert parsed.enabled is True
    assert parsed.mode == "library"
    assert parsed.library_base_clip_count == 7
    assert parsed.library_base_clip_seconds == 3
def test_reflective_section_override():
    """Values under ``reflective`` land on the ``reflective_*`` config fields."""
    reflective_section = {
        "clip_seconds": 9,
        "clip_prompt_template": "my template: {reply_hint}",
        "prompt_reply_words": 5,
    }
    parsed = VideoConfig.from_dict({"reflective": reflective_section})
    assert parsed.reflective_clip_seconds == 9
    assert parsed.reflective_prompt_template == "my template: {reply_hint}"
    assert parsed.reflective_prompt_reply_words == 5
def test_lora_parse_minimal():
    """A LoRA entry with only ``path`` gets weight 1.0, target "both" and no name."""
    parsed = VideoConfig.from_dict({"loras": [{"path": "/tmp/a.safetensors"}]})
    assert len(parsed.loras) == 1
    only_lora = parsed.loras[0]
    assert only_lora.path == "/tmp/a.safetensors"
    assert only_lora.weight == 1.0
    assert only_lora.target == "both"
    assert only_lora.name is None
def test_lora_parse_full():
    """Fully specified LoRA entries keep their target, name and weight."""
    high_entry = {
        "path": "/tmp/hi.safetensors",
        "weight": 0.7,
        "target": "high_noise",
        "name": "hi-noise-style",
    }
    low_entry = {
        "path": "/tmp/lo.safetensors",
        "weight": 0.4,
        "target": "low_noise",
        "name": "lo-noise-style",
    }
    parsed = VideoConfig.from_dict({"loras": [high_entry, low_entry]})
    assert len(parsed.loras) == 2
    high, low = parsed.loras[0], parsed.loras[1]
    assert high.target == "high_noise"
    assert high.name == "hi-noise-style"
    assert low.target == "low_noise"
    assert low.weight == 0.4
def test_lora_invalid_target_falls_back_to_both():
    """An unrecognised ``target`` value is expected to be normalised to "both"."""
    bad_entry = {"path": "/tmp/x.safetensors", "target": "bogus"}
    parsed = VideoConfig.from_dict({"loras": [bad_entry]})
    assert parsed.loras[0].target == "both"
def test_lora_entries_without_path_are_dropped():
    """Entries lacking ``path`` (and ``None`` entries) are skipped during parsing."""
    raw_loras = [{"weight": 0.5}, {"path": "/tmp/ok.safetensors"}, None]
    parsed = VideoConfig.from_dict({"loras": raw_loras})
    assert len(parsed.loras) == 1
    survivor = parsed.loras[0]
    assert survivor.path == "/tmp/ok.safetensors"
def test_models_section_override():
    """Values under ``models`` land on the matching model-path config fields."""
    models_section = {
        "wan22_base_repo": "/local/weights/wan22",
        "wan22_fp8_repo": "/local/weights/wan22-fp8",
        "wan22_config_json": "/local/cfg/fp8.json",
        "wan22_model_cls": "wan2.2_moe",
        "musetalk_path": "/local/weights/musetalk",
    }
    parsed = VideoConfig.from_dict({"models": models_section})
    assert parsed.wan22_base_repo == "/local/weights/wan22"
    assert parsed.wan22_fp8_repo == "/local/weights/wan22-fp8"
    assert parsed.wan22_config_json == "/local/cfg/fp8.json"
    assert parsed.wan22_model_cls == "wan2.2_moe"
    # Note the field name differs from the raw key ("musetalk_path").
    assert parsed.musetalk_model_path == "/local/weights/musetalk"
+106
View File
@@ -0,0 +1,106 @@
"""Unit tests for pure-python logic inside VideoEngine.
No models are loaded: we instantiate ``VideoEngine`` and hand-stub its
``_wan22`` / ``_musetalk`` attributes to test prompt derivation, library
round-robin, and frame fitting.
"""
import numpy as np
import pytest
from server.video import VideoConfig, VideoEngine
@pytest.fixture
def engine():
    """Return a fresh reflective-mode ``VideoEngine`` with no models loaded.

    The prompt template ``"A: {reply_hint} B"`` and a 5-word reply limit keep
    the prompt-derivation assertions short.
    """
    reflective_section = {
        "clip_prompt_template": "A: {reply_hint} B",
        "prompt_reply_words": 5,
    }
    raw = {
        "enabled": True,
        "mode": "reflective",
        "fps": 16,
        "reflective": reflective_section,
    }
    return VideoEngine(VideoConfig.from_dict(raw))
def test_derive_prompt_truncates_to_word_limit(engine):
    """Only the first five words of the reply reach the prompt."""
    long_reply = "one two three four five six seven eight"
    prompt = engine._derive_prompt(long_reply)
    assert prompt == "A: one two three four five B"
def test_derive_prompt_handles_empty_reply(engine):
    """An empty or ``None`` reply falls back to the "calm and friendly" hint."""
    fallback_prompt = "A: calm and friendly B"
    from_empty = engine._derive_prompt("")
    assert from_empty == fallback_prompt
    from_none = engine._derive_prompt(None)  # type: ignore[arg-type]
    assert from_none == fallback_prompt
def test_derive_prompt_strips_and_passes_through(engine):
    """Surrounding whitespace is removed; a short reply is used verbatim."""
    padded_reply = " hello world "
    prompt = engine._derive_prompt(padded_reply)
    assert prompt == "A: hello world B"
def test_is_ready_false_without_models(engine):
    """A freshly constructed engine is not ready."""
    # Models haven't been loaded — is_ready must be False so the pipeline
    # falls back to the PCM streaming path.
    ready = engine.is_ready()
    assert ready is False
def test_pick_library_frames_round_robin(engine):
    """Successive picks alternate between the base clips and wrap around."""
    engine.cfg.mode = "library"
    engine.cfg.fps = 2
    # Two base clips, 4 frames each.
    black_clip = np.zeros((4, 1, 1, 3), dtype=np.uint8)
    white_clip = np.full((4, 1, 1, 3), 255, dtype=np.uint8)
    engine.speaking_base_frames = [black_clip, white_clip]

    # 2s of audio at 16kHz → 4 frames at fps=2
    audio = np.zeros(16000 * 2, dtype=np.float32)
    first = engine._pick_library_frames(audio, 16000)
    second = engine._pick_library_frames(audio, 16000)
    third = engine._pick_library_frames(audio, 16000)

    assert first.shape == (4, 1, 1, 3)
    assert first[0, 0, 0, 0] == 0  # first pick = clip A
    assert second[0, 0, 0, 0] == 255  # second pick = clip B
    assert third[0, 0, 0, 0] == 0  # wraps back to A
def test_pick_library_frames_trims_to_audio_duration(engine):
    """A base clip longer than the audio is cut down to the audio's frame count."""
    engine.cfg.mode = "library"
    engine.cfg.fps = 4
    long_clip = np.zeros((20, 1, 1, 3), dtype=np.uint8)
    engine.speaking_base_frames = [long_clip]

    # 1s audio → 4 frames
    one_second = np.zeros(16000, dtype=np.float32)
    picked = engine._pick_library_frames(one_second, 16000)
    assert picked.shape == (4, 1, 1, 3)
def test_pick_library_frames_loops_for_long_audio(engine):
    """A base clip shorter than the audio is extended to the audio's frame count."""
    engine.cfg.mode = "library"
    engine.cfg.fps = 4
    short_clip = np.zeros((4, 1, 1, 3), dtype=np.uint8)
    engine.speaking_base_frames = [short_clip]

    # 3s audio → 12 frames, base has only 4
    three_seconds = np.zeros(16000 * 3, dtype=np.float32)
    picked = engine._pick_library_frames(three_seconds, 16000)
    assert picked.shape == (12, 1, 1, 3)
def test_pick_library_frames_raises_when_empty(engine):
    """With no pre-baked clips, picking frames raises ``RuntimeError``."""
    engine.cfg.mode = "library"
    engine.speaking_base_frames = []
    short_audio = np.zeros(100, dtype=np.float32)
    with pytest.raises(RuntimeError, match="no pre-baked base clips"):
        engine._pick_library_frames(short_audio, 16000)
def test_generate_speaking_clip_raises_when_not_ready(engine):
    """Generating a clip on an engine that is not ready raises ``RuntimeError``."""
    short_audio = np.zeros(100, dtype=np.float32)
    with pytest.raises(RuntimeError, match="not ready"):
        engine.generate_speaking_clip(
            audio_f32=short_audio,
            sample_rate=16000,
            reply_text="hi",
        )