Working OK: switch Wan2.2 defaults to the dense TI2V-5B Turbo GGUF DIT and make MuseTalk lip-sync optional

2026-04-16 10:00:37 -04:00
parent 9debc56137
commit 129df7d1fa
24 changed files with 674 additions and 539 deletions
@@ -18,18 +18,18 @@ import numpy as np

 log = logging.getLogger(__name__)

-LoRATarget = Literal["high_noise", "low_noise", "both"]
+LoRATarget = Literal["both"]


@dataclass
 class LoRASpec:
    """One LoRA adapter entry from ``config.video.loras``.

-    Wan2.2 I2V is a Mixture-of-Experts model with separate high-noise and
-    low-noise sub-models. Most LightX2V distill LoRAs come paired (one per
-    sub-model) and must be applied to the correct target. Allow
-    ``target="both"`` for LoRAs that should be applied to both sub-models
-    (e.g. style LoRAs).
+    The dense Wan2.2-TI2V-5B DIT has a single set of weights (no MoE
+    experts), so ``target`` is always ``"both"``. The field is kept for
+    forward compatibility and so that older MoE config files still load:
+    any other value, including legacy ``"high_noise"`` / ``"low_noise"``,
+    is coerced to ``"both"`` (with a warning) in ``VideoConfig.from_dict``.
    """

    path: str
@@ -60,18 +60,20 @@ class VideoConfig:
    # Model paths — can be overridden via config.yml.video.models.
    # wan22_base_repo     : HF repo id (or local dir) providing T5/VAE/tokenizer.
    #                       The bf16 DIT shards in this repo are skipped — we
-    #                       replace them with quantised files from wan22_dit_repo.
-    # wan22_dit_repo      : HF repo id (or local dir) providing the quantised
-    #                       DIT checkpoints (fp8 or GGUF).
-    # wan22_dit_quant_scheme : quantisation format, e.g. "fp8-sgl" or "gguf-Q4_K_M".
+    #                       replace them with a quantised GGUF from wan22_dit_repo.
+    # wan22_dit_repo      : HF repo id (or local dir) providing the single
+    #                       dense GGUF DIT checkpoint (5B Turbo).
+    # wan22_dit_quant_scheme : GGUF quant level, e.g. "gguf-Q8_0" (default)
+    #                       or "gguf-Q4_K_M" for lower VRAM.
    # wan22_config_json   : path to the LightX2V inference config template the
    #                       Wan22Pipeline will fill in with absolute ckpt paths.
-    wan22_base_repo: str = "Wan-AI/Wan2.2-I2V-A14B"
-    wan22_dit_repo: str = "lightx2v/Wan2.2-Distill-Models"
-    wan22_dit_quant_scheme: str = "fp8-sgl"
-    wan22_t5_quantized: bool = False
-    wan22_config_json: str = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
-    wan22_model_cls: str = "wan2.2_moe_distill"
+    wan22_base_repo: str = "Wan-AI/Wan2.2-TI2V-5B"
+    wan22_dit_repo: str = "hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF"
+    wan22_dit_quant_scheme: str = "gguf-Q8_0"
+    wan22_t5_quantized: bool = True
+    wan22_config_json: str = "/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json"
+    wan22_model_cls: str = "wan2.2"
+    musetalk_enabled: bool = True
    musetalk_model_path: str = "TMElyralab/MuseTalk"

    @classmethod
@@ -92,9 +94,10 @@ class VideoConfig:
            if not entry or "path" not in entry:
                continue
            target = str(entry.get("target", "both")).lower()
-            if target not in ("high_noise", "low_noise", "both"):
+            if target != "both":
                log.warning(
-                    "LoRA %s: invalid target %r, defaulting to 'both'",
+                    "LoRA %s: target %r is MoE-era; coercing to 'both' "
+                    "(dense 5B has a single DIT).",
                    entry.get("path"), target,
                )
                target = "both"
@@ -122,30 +125,32 @@ class VideoConfig:
            reflective_prompt_reply_words=int(reflective.get("prompt_reply_words", 18)),
            loras=loras,
            wan22_base_repo=str(
-                models_raw.get("wan22_base_repo", "Wan-AI/Wan2.2-I2V-A14B")
+                models_raw.get("wan22_base_repo", "Wan-AI/Wan2.2-TI2V-5B")
            ),
            wan22_dit_repo=str(
                models_raw.get(
                    "wan22_dit_repo",
-                    # Backwards compat: fall back to old key name.
-                    models_raw.get("wan22_fp8_repo", "lightx2v/Wan2.2-Distill-Models"),
+                    "hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF",
                )
            ),
            wan22_dit_quant_scheme=str(
-                models_raw.get("wan22_dit_quant_scheme", "fp8-sgl")
+                models_raw.get("wan22_dit_quant_scheme", "gguf-Q8_0")
            ),
            wan22_t5_quantized=bool(
-                models_raw.get("wan22_t5_quantized", False)
+                models_raw.get("wan22_t5_quantized", True)
            ),
            wan22_config_json=str(
                models_raw.get(
                    "wan22_config_json",
-                    "/app/configs/lightx2v/wan22_i2v_fp8_distill.json",
+                    "/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json",
                )
            ),
            wan22_model_cls=str(
-                models_raw.get("wan22_model_cls", "wan2.2_moe_distill")
+                models_raw.get("wan22_model_cls", "wan2.2")
            ),
+            musetalk_enabled=bool(raw.get("musetalk", {}).get("enabled", True))
+            if isinstance(raw.get("musetalk"), dict)
+            else bool(raw.get("musetalk_enabled", True)),
            musetalk_model_path=str(
                models_raw.get("musetalk_path", "TMElyralab/MuseTalk")
            ),
@@ -235,17 +240,22 @@ class VideoEngine:
            self._wan22.load_loras(self.cfg.loras)
        log.info("Wan2.2 pipeline ready.")

-        log.info("Loading MuseTalk engine (%s)...", self.cfg.musetalk_model_path)
-        self._musetalk = MuseTalkEngine(model_path=self.cfg.musetalk_model_path)
-        log.info("MuseTalk engine ready.")
+        if self.cfg.musetalk_enabled:
+            log.info("Loading MuseTalk engine (%s)...", self.cfg.musetalk_model_path)
+            self._musetalk = MuseTalkEngine(model_path=self.cfg.musetalk_model_path)
+            log.info("MuseTalk engine ready.")
+        else:
+            log.info("MuseTalk disabled via config — skipping lip-sync pass.")
+            self._musetalk = None

    # --- Readiness ------------------------------------------------------

    def is_ready(self) -> bool:
        """True when an avatar is set and a speaking clip can be produced."""
+        musetalk_ok = (not self.cfg.musetalk_enabled) or self._musetalk is not None
        return (
            self._wan22 is not None
-            and self._musetalk is not None
+            and musetalk_ok
            and self.avatar_path is not None
            and self.idle_clip_mp4 is not None
        )
@@ -336,7 +346,6 @@ class VideoEngine:
                "(avatar set? models loaded?)"
            )
        assert self._wan22 is not None
-        assert self._musetalk is not None

        # 1. Source base frames.
        if self.cfg.mode == "library":
@@ -351,13 +360,16 @@ class VideoEngine:
                seed=None,  # random each turn
            )

-        # 2. Lip-sync the base frames to the given audio.
-        synced_frames = self._musetalk.lip_sync(
-            frames=base_frames,
-            audio=audio_f32,
-            sample_rate=sample_rate,
-            fps=self.cfg.fps,
-        )
+        # 2. Lip-sync the base frames to the given audio (if enabled).
+        if self._musetalk is not None:
+            synced_frames = self._musetalk.lip_sync(
+                frames=base_frames,
+                audio=audio_f32,
+                sample_rate=sample_rate,
+                fps=self.cfg.fps,
+            )
+        else:
+            synced_frames = base_frames

        # 3. Mux frames + audio into an MP4.
        from server.video_models.muxer import frames_and_audio_to_mp4