live-voice-chat/tests/component/test_12_dit_single_step.py

"""Smoke test: single DIT denoising step with GGUF weights.

Builds the pipeline, runs all encoders, initializes the scheduler, then
executes exactly one DIT forward pass (step_pre → infer → step_post).
This isolates the GGUF fp16 DIT from the rest of the pipeline.

Run:
    docker compose exec -e DIT_QUANT=gguf-Q4_K_M voice-chat \
        python -m tests.component.test_12_dit_single_step
"""
from __future__ import annotations

import copy
import os
import sys

import torch

from tests.component._common import ensure_sample_avatar, get_logger

# Logger for this test, obtained from the shared component-test helper.
log = get_logger("test_12")

# Quantization scheme handed to the pipeline as ``dit_quant_scheme``. Read from the
# DIT_QUANT environment variable; falls back to "gguf-Q8_0" when it is unset.
DIT_QUANT = os.environ.get("DIT_QUANT", "gguf-Q8_0")
# Path handed to the pipeline as ``config_json``.
CONFIG_JSON = "/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json"
# Value handed to the pipeline as ``dit_repo``.
DIT_REPO = "hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF"


def run():
    """Build the pipeline and execute exactly one DIT denoising step.

    Steps performed, in order:
      1. Import and construct ``Wan22Pipeline`` with the module-level settings.
      2. Fill the pipeline's input-info template for a 17-frame clip and
         attach it to the runner.
      3. Run the input encoders and log shape/dtype of every tensor output
         (including tensors nested one level deep in dict outputs).
      4. Call ``init_run()`` and snapshot the scheduler's initial latents.
      5. Run ``step_pre`` -> ``infer`` -> ``step_post`` once.
      6. Verify the resulting latents are finite and differ from the snapshot.

    Raises:
        SystemExit: with status 0 when ``server.video_models.wan22`` cannot
            be imported.
        AssertionError: when the latents are unchanged after the step or
            contain NaN/Inf values.
    """
    try:
        from server.video_models.wan22 import Wan22Pipeline
    except ImportError as e:
        log.error("Import failed: %s", e)
        # NOTE(review): exit status 0 reports success to the caller even though
        # nothing was tested. Presumably a deliberate "skip" when the server
        # package is unavailable -- confirm before changing.
        sys.exit(0)

    avatar = ensure_sample_avatar()
    log.info("Avatar: %s", avatar)

    log.info("Building pipeline (quant=%s)...", DIT_QUANT)
    pipe = Wan22Pipeline(
        base_repo="Wan-AI/Wan2.2-TI2V-5B",
        dit_repo=DIT_REPO,
        config_json=CONFIG_JSON,
        model_cls="wan2.2",
        resolution=480,
        fps=16,
        dit_quant_scheme=DIT_QUANT,
        t5_quantized=True,
    )
    log.info("Pipeline ready.")

    runner = pipe._runner

    # Set up input_info for a short clip
    from lightx2v.utils.input_info import update_input_info_from_dict
    update_input_info_from_dict(
        pipe._input_info_template,
        {
            "seed": 42,
            "prompt": "a person looking at the camera, natural lighting",
            "negative_prompt": "",
            "image_path": avatar,
            "target_video_length": 17,  # 1 second at 16fps + 1
        },
    )
    runner.input_info = pipe._input_info_template

    # 1. Run all encoders (T5 + CLIP + VAE)
    log.info("Running all input encoders (T5 + CLIP + VAE)...")
    runner.inputs = runner.run_input_encoder()
    log.info("Encoder outputs ready.")
    for k, v in runner.inputs.items():
        if isinstance(v, torch.Tensor):
            log.info("  inputs[%s]: shape=%s dtype=%s", k, v.shape, v.dtype)
        elif isinstance(v, dict):
            for k2, v2 in v.items():
                if isinstance(v2, torch.Tensor):
                    log.info("  inputs[%s][%s]: shape=%s dtype=%s", k, k2, v2.shape, v2.dtype)

    # 2. Initialize run (sets up scheduler, creates noise latents)
    log.info("Initializing run (scheduler.prepare)...")
    runner.init_run()
    scheduler = runner.model.scheduler
    # Snapshot a copy, not a reference: if the scheduler updates its latents
    # tensor in place, a bare reference would alias the post-step tensor and
    # the "latents changed" comparison below could never succeed.
    latents = scheduler.latents.detach().clone()
    log.info("Initial latents: shape=%s dtype=%s", latents.shape, latents.dtype)

    # 3. Single DIT step
    log.info("Running single DIT step (step_pre → infer → step_post)...")
    scheduler.step_pre(step_index=0)
    runner.model.infer(runner.inputs)
    scheduler.step_post()

    latents_after = scheduler.latents
    log.info("Latents after step: shape=%s dtype=%s", latents_after.shape, latents_after.dtype)

    # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
    # NaN never compares equal, so an all-NaN result would otherwise count as
    # "changed" and slip through the equality check below.
    if not bool(torch.isfinite(latents_after).all()):
        raise AssertionError("Latents contain NaN/Inf after DIT step")

    # Verify latents changed (denoising did something)
    if torch.equal(latents, latents_after):
        raise AssertionError("Latents unchanged after DIT step")
    log.info("PASS: DIT single step completed, latents updated.")

    log.info("PASS: DIT forward pass succeeded under %s pipeline.", DIT_QUANT)


# Execute the smoke test when this file is run as a script or via ``python -m``.
if __name__ == "__main__":
    run()