t5 encoder fp8 seems to be working

2026-04-12 13:50:34 -04:00
parent 2818b41004
commit fcf0be38bc
13 changed files with 505 additions and 67 deletions
@@ -1,15 +1,22 @@
-"""Phase 2 component test: Wan2.2-Lightning fp8 pipeline + LoRA stacking.
+r"""Phase 2 component test: Wan2.2 pipeline + LoRA stacking.

 Verifies:
-- ``Wan22Pipeline`` loads successfully against the fp8 distill path
-  (exercises the real LightX2V set_config → init_runner flow).
+- ``Wan22Pipeline`` loads successfully (exercises the real LightX2V
+  set_config -> init_runner flow).
 - ``load_loras`` / ``unload_loras`` survive with the two user LoRAs at
  ``/cache/loras/wan22-[HL]-e8.safetensors``.

-Requires GPU and a first-run download of both HF repos (base support files
-~12 GB, fp8 DIT ~30 GB). If LightX2V isn't installed the test is skipped.
+Supports both fp8 and GGUF DIT quantisation.  Set the ``DIT_QUANT``
+environment variable to switch (default: ``fp8-sgl``).

-Run:
+    docker compose exec -e DIT_QUANT=gguf-Q4_K_M voice-chat \
+        python -m tests.component.test_02_wan22_loras
+
+Requires GPU and a first-run download of both HF repos (base support files
+~12 GB, DIT size depends on quant — fp8 ~30 GB, GGUF Q4_K_M ~19 GB).
+If LightX2V isn't installed the test is skipped.
+
+Run (default fp8):
    docker compose exec voice-chat python -m tests.component.test_02_wan22_loras
 """
 from __future__ import annotations
@@ -21,7 +28,17 @@ from tests.component._common import get_logger

 log = get_logger("test_02")

-CONFIG_JSON = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
+# --- Quant-dependent defaults ------------------------------------------------
+
+DIT_QUANT = os.environ.get("DIT_QUANT", "fp8-sgl")
+
+if DIT_QUANT.startswith("gguf-"):
+    CONFIG_JSON = "/app/configs/lightx2v/wan22_i2v_gguf_distill.json"
+    DIT_REPO = "QuantStack/Wan2.2-I2V-A14B-GGUF"
+else:
+    CONFIG_JSON = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
+    DIT_REPO = "lightx2v/Wan2.2-Distill-Models"
+
 LORA_HIGH = "/cache/loras/wan22-H-e8.safetensors"
 LORA_LOW = "/cache/loras/wan22-L-e8.safetensors"

@@ -37,15 +54,16 @@ def run():
    from server.video import LoRASpec

    log.info("[case 1] Instantiate Wan22Pipeline "
-             "(first run downloads ~42 GB total)...")
+             "(quant=%s, dit_repo=%s)...", DIT_QUANT, DIT_REPO)
    try:
        pipe = Wan22Pipeline(
            base_repo="Wan-AI/Wan2.2-I2V-A14B",
-            fp8_repo="lightx2v/Wan2.2-Distill-Models",
+            dit_repo=DIT_REPO,
            config_json=CONFIG_JSON,
            model_cls="wan2.2_moe_distill",
            resolution=480,
            fps=16,
+            dit_quant_scheme=DIT_QUANT,
        )
    except Exception as e:
        log.error("FAIL: Wan22Pipeline construction raised: %s", e)
@@ -56,7 +74,7 @@ def run():
    log.info("  PASS: pipeline constructed")

    # --- LoRAs ---
-    log.info("[case 2] load_loras with empty list → no-op")
+    log.info("[case 2] load_loras with empty list -> no-op")
    pipe.load_loras([])
    log.info("  PASS")