live-voice-chat/tests/unit/test_video_engine_logic.py

"""Unit tests for pure-python logic inside VideoEngine.

No models are loaded: we instantiate ``VideoEngine`` and hand-stub its
``_wan22`` / ``_musetalk`` attributes to test prompt derivation, library
round-robin, and frame fitting.
"""
import numpy as np
import pytest

from server.video import VideoConfig, VideoEngine


@pytest.fixture
def engine():
    """Build a ``VideoEngine`` from a small reflective-mode config.

    The prompt template and the five-word reply limit are fixed here so the
    prompt-derivation tests can assert on exact output strings.
    """
    reflective_opts = {
        "clip_prompt_template": "A: {reply_hint} B",
        "prompt_reply_words": 5,
    }
    raw_settings = {
        "enabled": True,
        "mode": "reflective",
        "fps": 16,
        "reflective": reflective_opts,
    }
    video_cfg = VideoConfig.from_dict(raw_settings)
    return VideoEngine(video_cfg)


def test_derive_prompt_truncates_to_word_limit(engine):
    """An eight-word reply is cut to its first five words in the prompt."""
    long_reply = "one two three four five six seven eight"
    expected_prompt = "A: one two three four five B"
    assert engine._derive_prompt(long_reply) == expected_prompt


def test_derive_prompt_handles_empty_reply(engine):
    """Both an empty string and ``None`` yield the default reply hint."""
    fallback_prompt = "A: calm and friendly B"
    # Same order as before: the empty string first, then None.
    for blank_reply in ("", None):
        derived = engine._derive_prompt(blank_reply)  # type: ignore[arg-type]
        assert derived == fallback_prompt


def test_derive_prompt_strips_and_passes_through(engine):
    """Surrounding whitespace is removed; a short reply is used unchanged."""
    padded_reply = "  hello world  "
    derived = engine._derive_prompt(padded_reply)
    assert derived == "A: hello world B"


def test_is_ready_false_without_models(engine):
    """A freshly constructed engine, with no models loaded, is not ready."""
    # The pipeline relies on a strict False here to fall back to the PCM
    # streaming path, so check identity rather than mere falsiness.
    readiness = engine.is_ready()
    assert readiness is False


def test_pick_library_frames_round_robin(engine):
    """Successive picks cycle through the base clips and wrap around."""
    sample_rate = 16000
    clip_shape = (4, 1, 1, 3)

    engine.cfg.mode = "library"
    engine.cfg.fps = 2

    # Two single-pixel base clips of 4 frames each, told apart by pixel value.
    dark_clip = np.zeros(clip_shape, dtype=np.uint8)
    bright_clip = np.full(clip_shape, 255, dtype=np.uint8)
    engine.speaking_base_frames = [dark_clip, bright_clip]

    # 2 seconds of silence at fps=2 corresponds to 4 frames.
    silence = np.zeros(2 * sample_rate, dtype=np.float32)

    picks = [
        engine._pick_library_frames(silence, sample_rate) for _ in range(3)
    ]

    assert picks[0].shape == clip_shape
    # Expected order: clip A, clip B, then wrap back to clip A.
    for picked, first_value in zip(picks, (0, 255, 0)):
        assert picked[0, 0, 0, 0] == first_value


def test_pick_library_frames_trims_to_audio_duration(engine):
    """A 20-frame base clip is cut down to match one second of audio."""
    sample_rate = 16000
    engine.cfg.mode = "library"
    engine.cfg.fps = 4

    long_clip = np.zeros((20, 1, 1, 3), dtype=np.uint8)
    engine.speaking_base_frames = [long_clip]

    # One second of audio at fps=4 should leave exactly 4 frames.
    one_second = np.zeros(sample_rate, dtype=np.float32)
    trimmed = engine._pick_library_frames(one_second, sample_rate)

    expected_shape = (4, 1, 1, 3)
    assert trimmed.shape == expected_shape


def test_pick_library_frames_loops_for_long_audio(engine):
    """A 4-frame base clip is extended to cover three seconds of audio."""
    sample_rate = 16000
    engine.cfg.mode = "library"
    engine.cfg.fps = 4

    short_clip = np.zeros((4, 1, 1, 3), dtype=np.uint8)
    engine.speaking_base_frames = [short_clip]

    # Three seconds at fps=4 needs 12 frames, three times what the clip has.
    three_seconds = np.zeros(3 * sample_rate, dtype=np.float32)
    extended = engine._pick_library_frames(three_seconds, sample_rate)

    expected_shape = (12, 1, 1, 3)
    assert extended.shape == expected_shape


def test_pick_library_frames_raises_when_empty(engine):
    """Picking from an empty clip library raises a descriptive RuntimeError."""
    engine.cfg.mode = "library"
    engine.speaking_base_frames = []
    short_audio = np.zeros(100, dtype=np.float32)

    expect_failure = pytest.raises(RuntimeError, match="no pre-baked base clips")
    with expect_failure:
        engine._pick_library_frames(short_audio, 16000)


def test_generate_speaking_clip_raises_when_not_ready(engine):
    """Requesting a clip from an engine that is not ready raises RuntimeError."""
    call_kwargs = {
        "audio_f32": np.zeros(100, dtype=np.float32),
        "sample_rate": 16000,
        "reply_text": "hi",
    }
    expect_failure = pytest.raises(RuntimeError, match="not ready")
    with expect_failure:
        engine.generate_speaking_clip(**call_kwargs)