first stab at adding video

This commit is contained in:
2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
View File
+65
View File
@@ -0,0 +1,65 @@
"""Unit tests for the frame-length fitting helper in server.video_models.musetalk.
Pure-python: does not import MuseTalk itself.
"""
import numpy as np
from server.video_models.musetalk import _fit_frames_to_length, _ensure_uint8_rgb
def _make_frames(t, h=2, w=2):
    """Return a ``(t, h, w, 3)`` uint8 array filled with consecutive values."""
    shape = (t, h, w, 3)
    element_count = t * h * w * 3
    flat = np.arange(element_count, dtype=np.uint8)
    return flat.reshape(shape)
def test_fit_frames_trim():
    """A 10-frame clip fitted to 4 frames keeps only the first 4."""
    source = _make_frames(10)
    trimmed = _fit_frames_to_length(source, 4)
    expected = source[:4]
    assert trimmed.shape == (4, 2, 2, 3)
    np.testing.assert_array_equal(trimmed, expected)
def test_fit_frames_passthrough_when_equal():
    """Fitting to the clip's own length yields the same frames (object or copy)."""
    source = _make_frames(5)
    result = _fit_frames_to_length(source, 5)
    # Either the very same array comes back, or an element-wise equal one.
    assert result is source or np.array_equal(result, source)
def test_fit_frames_extends_with_pingpong():
    """A 3-frame clip stretched to 8 frames goes forward, backward, forward."""
    base = _make_frames(3)
    extended = _fit_frames_to_length(base, 8)
    assert extended.shape == (8, 2, 2, 3)

    # (window of the output, frames expected in that window)
    expected_segments = (
        (slice(0, 3), base),        # original order
        (slice(3, 6), base[::-1]),  # reversed (ping-pong)
        (slice(6, 8), base[:2]),    # forward again, cut at the target length
    )
    for window, expected in expected_segments:
        np.testing.assert_array_equal(extended[window], expected)
def test_fit_frames_zero_target_returns_original():
    """A target length of 0 leaves the frames unchanged."""
    source = _make_frames(3)
    np.testing.assert_array_equal(_fit_frames_to_length(source, 0), source)
def test_ensure_uint8_rgb_from_float():
    """Float32 frames of 0.5 convert to uint8 with value 127 and the same shape."""
    half_grey = np.full((5, 2, 2, 3), 0.5, dtype=np.float32)
    converted = _ensure_uint8_rgb(half_grey)
    assert converted.dtype == np.uint8
    assert converted.shape == (5, 2, 2, 3)
    assert converted[0, 0, 0, 0] == 127
def test_ensure_uint8_rgb_promotes_3d_to_4d():
    """A single (H, W, 3) image comes back with a leading axis of length 1."""
    single_image = np.zeros((2, 2, 3), dtype=np.uint8)
    assert _ensure_uint8_rgb(single_image).shape == (1, 2, 2, 3)
def test_ensure_uint8_rgb_clips_float_out_of_range():
    """Float values above/below the valid range clip to 255 / 0."""
    # (float fill value, expected uint8 after clipping)
    cases = ((2.0, 255), (-1.0, 0))
    for fill, expected in cases:
        arr = np.full((1, 1, 1, 3), fill, dtype=np.float32)
        assert _ensure_uint8_rgb(arr)[0, 0, 0, 0] == expected
+67
View File
@@ -0,0 +1,67 @@
"""Unit tests for the ffmpeg muxer.
Requires ``ffmpeg`` on PATH. If ffmpeg is not installed (e.g. on a local
Windows checkout) these tests are skipped; they will run inside the Docker
image, where ffmpeg is always present.
"""
import os
import shutil
import struct
import numpy as np
import pytest
from server.video_models.muxer import frames_and_audio_to_mp4, frames_to_mp4_loop
# Skip every test in this module when no ffmpeg executable is found on PATH.
_FFMPEG_MISSING = shutil.which("ffmpeg") is None
pytestmark = pytest.mark.skipif(
    _FFMPEG_MISSING,
    reason="ffmpeg not installed locally; run these inside Docker",
)
def _rgb_frames(t, h=64, w=64):
    """Build ``t`` coloured ``(h, w, 3)`` uint8 frames so the encoder has real content.

    Red ramps with the frame index, green fills the top half of each frame and
    blue fills the left half.
    """
    frames = np.zeros((t, h, w, 3), dtype=np.uint8)
    # Per-frame red level, broadcast over the whole image plane.
    red_levels = ((np.arange(t) * 20) % 255).astype(np.uint8)
    frames[:, :, :, 0] = red_levels[:, None, None]
    frames[:, : h // 2, :, 1] = 255
    frames[:, :, : w // 2, 2] = 255
    return frames
def test_frames_to_mp4_loop_produces_mp4_bytes():
    """Encoding 8 frames returns non-empty bytes that start with an ftyp box."""
    payload = frames_to_mp4_loop(_rgb_frames(8), fps=16)
    assert isinstance(payload, bytes)
    assert len(payload) > 0
    # An MP4 file opens with a box header: 4 bytes of size, then the 'ftyp' tag.
    box_type = payload[4:8]
    assert box_type == b"ftyp"
def test_frames_and_audio_to_mp4_produces_mp4_bytes():
    """Muxing 16 frames with 1 s of silence returns bytes with an ftyp box."""
    sample_rate = 24000
    # One second of silent audio at 24 kHz.
    silence = np.zeros(sample_rate, dtype=np.float32)
    payload = frames_and_audio_to_mp4(
        _rgb_frames(16), silence, sample_rate=sample_rate, fps=16
    )
    assert isinstance(payload, bytes)
    assert len(payload) > 0
    assert payload[4:8] == b"ftyp"
def test_frames_to_mp4_loop_rejects_empty():
    """A zero-frame input raises ValueError."""
    no_frames = np.empty((0, 64, 64, 3), dtype=np.uint8)
    with pytest.raises(ValueError):
        frames_to_mp4_loop(no_frames, fps=16)
def test_frames_and_audio_to_mp4_rejects_empty_audio():
    """Valid frames paired with a zero-length audio array raise ValueError."""
    frames = _rgb_frames(4)
    no_audio = np.empty(0, dtype=np.float32)
    with pytest.raises(ValueError):
        frames_and_audio_to_mp4(frames, no_audio, sample_rate=24000, fps=16)
def test_frames_to_mp4_loop_rejects_wrong_shape():
    """A 3-D array (no channel axis) raises ValueError."""
    missing_channel_axis = np.zeros((4, 64, 64), dtype=np.uint8)
    with pytest.raises(ValueError):
        frames_to_mp4_loop(missing_channel_axis, fps=16)
+144
View File
@@ -0,0 +1,144 @@
"""Unit test for the video-mode branch in ConversationSession.
Stubs every model involved (ASR, LLM, TTS, VideoEngine) so we can verify:
1. When video_engine is not ready, the existing PCM streaming path runs.
2. When video_engine IS ready, the per-chunk PCM sends are skipped and a
single ``speaking_clip`` JSON + MP4 binary is sent instead.
Pure asyncio; no CUDA, no real models.
"""
from __future__ import annotations
import asyncio
import types
from unittest.mock import MagicMock
import numpy as np
import pytest
from server.pipeline import ConversationSession
class _FakeVAD:
    """VAD stub: never flags speech and returns nothing for any chunk."""

    is_speaking = False

    def process_chunk(self, _):
        return None
class _FakeASR:
    """ASR stub that returns a fixed transcript regardless of the audio."""

    def __init__(self, text="hello"):
        self.text = text

    def transcribe(self, _):
        transcript = self.text
        return transcript
class _FakeLLM:
    """LLM stub: ``generate`` returns a canned reply paired with ``None``."""

    def __init__(self, response="Hi there."):
        self.response = response

    def generate(self, *_a, **_k):
        # All arguments are ignored; the second tuple slot is always None.
        reply = self.response
        return (reply, None)

    def trim_cache(self, state, _):
        # Hands the state back untouched.
        return state
class _FakeTTSIterable:
    """Drop-in replacement for Kokoro's pipeline(..) generator."""

    def __init__(self, chunks):
        self._chunks = chunks

    def __call__(self, segment, voice=None):
        # Yield one (label, None, audio) triple per stored chunk; the segment
        # and voice arguments are accepted but not used.
        yield from (
            (f"w{index}", None, audio)
            for index, audio in enumerate(self._chunks)
        )
class _FakeTTSEngine:
    """TTS engine stub exposing ``pipeline``, ``voice`` and ``sample_rate``."""

    def __init__(self, chunks):
        self.sample_rate = 24000
        self.voice = "v"
        # Calling the pipeline yields the supplied chunks one at a time.
        self.pipeline = _FakeTTSIterable(chunks)
class _FakeVideoEngineReady:
    """Video engine stub that is always ready and returns a fixed MP4 payload.

    ``generate_speaking_clip`` uses the same parameter names as the keyword
    call made against the real engine (``audio_f32``, ``sample_rate``,
    ``reply_text``), so the fake accepts both positional and keyword calls.
    """

    class _Cfg:
        mode = "reflective"

    cfg = _Cfg()

    def __init__(self):
        # Set to a summary of the arguments on the most recent call.
        self.called_with = None

    def is_ready(self):
        return True

    def generate_speaking_clip(self, audio_f32, sample_rate, reply_text):
        """Record a summary of the call and return the canned MP4 bytes."""
        self.called_with = {
            "len": len(audio_f32),
            "sr": sample_rate,
            "reply": reply_text,
        }
        return b"FAKE_MP4_BYTES"
class _FakeModelsBase:
    """Model container stub bundling the fake ASR, LLM and TTS engines."""

    def __init__(self, tts_chunks):
        self.asr_engine = _FakeASR()
        self.llm_engine = _FakeLLM()
        self.tts_engine = _FakeTTSEngine(tts_chunks)

    def create_vad(self):
        # A fresh fake VAD on every call.
        return _FakeVAD()
class _FakeModelsStreaming(_FakeModelsBase):
    """Fake models with no video engine attached."""

    video_engine = None
class _FakeModelsVideo(_FakeModelsBase):
    """Fake models carrying an always-ready fake video engine."""

    def __init__(self, tts_chunks):
        super().__init__(tts_chunks)
        ready_engine = _FakeVideoEngineReady()
        self.video_engine = ready_engine
@pytest.mark.asyncio
async def test_streaming_path_when_video_engine_absent():
    """With ``video_engine = None`` the session streams PCM, not a clip."""
    json_sent: list = []
    bytes_sent: list = []

    async def send_json(d):
        json_sent.append(d)

    async def send_bytes(b):
        bytes_sent.append(b)

    tts_chunks = [np.ones(length, dtype=np.float32) for length in (240, 480)]
    models = _FakeModelsStreaming(tts_chunks=tts_chunks)
    session = ConversationSession(models, send_json, send_bytes)
    utterance = np.zeros(16000, dtype=np.float32)
    await session._process_utterance(utterance)

    # PCM bytes were sent (one per TTS chunk).
    assert len(bytes_sent) == 2

    # Per-chunk response_text messages were sent (not video's one-shot).
    non_final_text = [
        m
        for m in json_sent
        if m.get("type") == "response_text" and not m.get("final")
    ]
    assert non_final_text

    # No speaking_clip envelope.
    assert all(m.get("type") != "speaking_clip" for m in json_sent)
@pytest.mark.asyncio
async def test_video_path_when_engine_ready():
    """With a ready video engine one speaking_clip envelope + MP4 blob is sent."""
    json_sent: list = []
    bytes_sent: list = []

    async def send_json(d):
        json_sent.append(d)

    async def send_bytes(b):
        bytes_sent.append(b)

    tts_chunks = [
        np.full(480, level, dtype=np.float32) for level in (0.5, 0.25)
    ]
    models = _FakeModelsVideo(tts_chunks=tts_chunks)
    session = ConversationSession(models, send_json, send_bytes)
    await session._process_utterance(np.zeros(16000, dtype=np.float32))

    fake_mp4 = b"FAKE_MP4_BYTES"

    # MP4 blob was sent once.
    assert bytes_sent == [fake_mp4]

    # speaking_clip envelope was sent exactly once.
    envelopes = [m for m in json_sent if m.get("type") == "speaking_clip"]
    assert len(envelopes) == 1
    envelope = envelopes[0]
    assert envelope["size_bytes"] == len(fake_mp4)
    assert envelope["text"] == "Hi there."

    # The video engine received the concatenated audio.
    recorded = models.video_engine.called_with
    assert recorded is not None
    assert recorded["len"] == 960  # 480 + 480
    assert recorded["reply"] == "Hi there."

    # No per-chunk PCM bytes were streamed (video path suppresses them).
    # Only the MP4 blob is in bytes_sent.
    assert len(bytes_sent) == 1
+119
View File
@@ -0,0 +1,119 @@
"""Unit tests for VideoConfig parsing and LoRASpec validation.
Pure-python, no model imports, no CUDA, no ffmpeg. Safe for Windows CI.
"""
import pytest
from server.video import VideoConfig, LoRASpec
def test_defaults_when_raw_is_empty():
    """An empty dict yields the default value for every checked field."""
    cfg = VideoConfig.from_dict({})
    assert cfg.enabled is False

    expected_defaults = {
        "backend": "lightx2v",
        "mode": "reflective",
        "resolution": 480,
        "fps": 16,
        "library_base_clip_count": 4,
        "reflective_prompt_reply_words": 18,
        "loras": [],
    }
    for attr, expected in expected_defaults.items():
        assert getattr(cfg, attr) == expected, attr
def test_defaults_when_raw_is_none():
    """Passing ``None`` instead of a dict still produces a disabled config."""
    parsed = VideoConfig.from_dict(None)  # type: ignore[arg-type]
    assert parsed.enabled is False
def test_library_section_override():
    """Values under the ``library`` key land on the ``library_*`` attributes."""
    raw = {
        "enabled": True,
        "mode": "library",
        "library": {"base_clip_count": 7, "base_clip_seconds": 3},
    }
    cfg = VideoConfig.from_dict(raw)
    assert cfg.enabled is True
    assert cfg.mode == "library"
    assert cfg.library_base_clip_count == 7
    assert cfg.library_base_clip_seconds == 3
def test_reflective_section_override():
    """Values under the ``reflective`` key land on the ``reflective_*`` attributes."""
    reflective_section = {
        "clip_seconds": 9,
        "clip_prompt_template": "my template: {reply_hint}",
        "prompt_reply_words": 5,
    }
    cfg = VideoConfig.from_dict({"reflective": reflective_section})
    assert cfg.reflective_clip_seconds == 9
    assert cfg.reflective_prompt_template == "my template: {reply_hint}"
    assert cfg.reflective_prompt_reply_words == 5
def test_lora_parse_minimal():
    """A LoRA entry with only ``path`` gets weight 1.0, target "both", no name."""
    raw = {"loras": [{"path": "/tmp/a.safetensors"}]}
    parsed_loras = VideoConfig.from_dict(raw).loras
    assert len(parsed_loras) == 1

    only = parsed_loras[0]
    assert only.path == "/tmp/a.safetensors"
    assert only.weight == 1.0
    assert only.target == "both"
    assert only.name is None
def test_lora_parse_full():
    """Fully specified LoRA entries keep their target, name and weight."""
    high_noise = {
        "path": "/tmp/hi.safetensors",
        "weight": 0.7,
        "target": "high_noise",
        "name": "hi-noise-style",
    }
    low_noise = {
        "path": "/tmp/lo.safetensors",
        "weight": 0.4,
        "target": "low_noise",
        "name": "lo-noise-style",
    }
    cfg = VideoConfig.from_dict({"loras": [high_noise, low_noise]})
    assert len(cfg.loras) == 2

    first, second = cfg.loras[0], cfg.loras[1]
    assert first.target == "high_noise"
    assert first.name == "hi-noise-style"
    assert second.target == "low_noise"
    assert second.weight == 0.4
def test_lora_invalid_target_falls_back_to_both():
    """An unrecognised ``target`` value is replaced by "both"."""
    entry = {"path": "/tmp/x.safetensors", "target": "bogus"}
    cfg = VideoConfig.from_dict({"loras": [entry]})
    assert cfg.loras[0].target == "both"
def test_lora_entries_without_path_are_dropped():
    """Entries lacking ``path`` (and ``None`` entries) are not kept."""
    raw_entries = [{"weight": 0.5}, {"path": "/tmp/ok.safetensors"}, None]
    cfg = VideoConfig.from_dict({"loras": raw_entries})
    assert len(cfg.loras) == 1
    assert cfg.loras[0].path == "/tmp/ok.safetensors"
def test_models_section_override():
    """Values under the ``models`` key are exposed as config attributes."""
    models_section = {
        "wan22_base_repo": "/local/weights/wan22",
        "wan22_fp8_repo": "/local/weights/wan22-fp8",
        "wan22_config_json": "/local/cfg/fp8.json",
        "wan22_model_cls": "wan2.2_moe",
        "musetalk_path": "/local/weights/musetalk",
    }
    cfg = VideoConfig.from_dict({"models": models_section})

    for key, value in models_section.items():
        # Only the MuseTalk key is stored under a different attribute name.
        attr = "musetalk_model_path" if key == "musetalk_path" else key
        assert getattr(cfg, attr) == value, attr
+106
View File
@@ -0,0 +1,106 @@
"""Unit tests for pure-python logic inside VideoEngine.
No models are loaded: we instantiate ``VideoEngine`` and hand-stub its
``_wan22`` / ``_musetalk`` attributes to test prompt derivation, library
round-robin, and frame fitting.
"""
import numpy as np
import pytest
from server.video import VideoConfig, VideoEngine
@pytest.fixture
def engine():
    """VideoEngine built from an enabled reflective-mode config (no models loaded here)."""
    reflective_section = {
        "clip_prompt_template": "A: {reply_hint} B",
        "prompt_reply_words": 5,
    }
    raw = {
        "enabled": True,
        "mode": "reflective",
        "fps": 16,
        "reflective": reflective_section,
    }
    return VideoEngine(VideoConfig.from_dict(raw))
def test_derive_prompt_truncates_to_word_limit(engine):
    """Only the first 5 reply words (the fixture's limit) reach the prompt."""
    reply = "one two three four five six seven eight"
    prompt = engine._derive_prompt(reply)
    assert prompt == "A: one two three four five B"
def test_derive_prompt_handles_empty_reply(engine):
    """An empty or ``None`` reply is replaced by the "calm and friendly" hint."""
    expected = "A: calm and friendly B"
    assert engine._derive_prompt("") == expected
    assert engine._derive_prompt(None) == expected  # type: ignore[arg-type]
def test_derive_prompt_strips_and_passes_through(engine):
    """Surrounding whitespace is removed before the reply enters the template."""
    prompt = engine._derive_prompt("  hello world  ")
    assert prompt == "A: hello world B"
def test_is_ready_false_without_models(engine):
    """A freshly constructed engine reports not-ready."""
    # Models haven't been loaded — is_ready must be False so the pipeline
    # falls back to the PCM streaming path.
    ready = engine.is_ready()
    assert ready is False
def test_pick_library_frames_round_robin(engine):
    """Successive picks alternate between the base clips and wrap around."""
    engine.cfg.mode = "library"
    engine.cfg.fps = 2

    # Two base clips, 4 frames each: one all-black, one all-white.
    clip_shape = (4, 1, 1, 3)
    black = np.zeros(clip_shape, dtype=np.uint8)
    white = np.full(clip_shape, 255, dtype=np.uint8)
    engine.speaking_base_frames = [black, white]

    # 2s of audio at 16kHz → 4 frames at fps=2
    sample_rate = 16000
    audio = np.zeros(sample_rate * 2, dtype=np.float32)

    first = engine._pick_library_frames(audio, sample_rate)
    second = engine._pick_library_frames(audio, sample_rate)
    third = engine._pick_library_frames(audio, sample_rate)

    assert first.shape == (4, 1, 1, 3)
    assert first[0, 0, 0, 0] == 0     # first pick = black clip
    assert second[0, 0, 0, 0] == 255  # second pick = white clip
    assert third[0, 0, 0, 0] == 0     # wraps back to the black clip
def test_pick_library_frames_trims_to_audio_duration(engine):
    """A 20-frame base clip is cut to the 4 frames that 1 s at fps=4 needs."""
    engine.cfg.mode = "library"
    engine.cfg.fps = 4
    engine.speaking_base_frames = [np.zeros((20, 1, 1, 3), dtype=np.uint8)]

    # 1s audio → 4 frames
    one_second = np.zeros(16000, dtype=np.float32)
    picked = engine._pick_library_frames(one_second, 16000)
    assert picked.shape == (4, 1, 1, 3)
def test_pick_library_frames_loops_for_long_audio(engine):
    """A 4-frame base clip is extended to the 12 frames that 3 s at fps=4 needs."""
    engine.cfg.mode = "library"
    engine.cfg.fps = 4
    engine.speaking_base_frames = [np.zeros((4, 1, 1, 3), dtype=np.uint8)]

    # 3s audio → 12 frames, base has only 4
    three_seconds = np.zeros(16000 * 3, dtype=np.float32)
    picked = engine._pick_library_frames(three_seconds, 16000)
    assert picked.shape == (12, 1, 1, 3)
def test_pick_library_frames_raises_when_empty(engine):
    """With no base clips available, picking raises RuntimeError."""
    engine.cfg.mode = "library"
    engine.speaking_base_frames = []
    short_audio = np.zeros(100, dtype=np.float32)
    with pytest.raises(RuntimeError, match="no pre-baked base clips"):
        engine._pick_library_frames(short_audio, 16000)
def test_generate_speaking_clip_raises_when_not_ready(engine):
    """Generating a clip on the fixture's engine raises a "not ready" RuntimeError."""
    call_kwargs = {
        "audio_f32": np.zeros(100, dtype=np.float32),
        "sample_rate": 16000,
        "reply_text": "hi",
    }
    with pytest.raises(RuntimeError, match="not ready"):
        engine.generate_speaking_clip(**call_kwargs)