working ok

This commit is contained in:
2026-04-16 10:00:37 -04:00
parent 9debc56137
commit 129df7d1fa
24 changed files with 674 additions and 539 deletions
+49 -37
View File
@@ -18,18 +18,18 @@ import numpy as np
log = logging.getLogger(__name__)
# Allowed values for a LoRA entry's ``target`` field. Only "both" is valid:
# the legacy "high_noise" / "low_noise" values are coerced to "both" when the
# config is parsed, so they are no longer part of the type.
LoRATarget = Literal["both"]
@dataclass
class LoRASpec:
"""One LoRA adapter entry from ``config.video.loras``.
Wan2.2 I2V is a Mixture-of-Experts model with separate high-noise and
low-noise sub-models. Most LightX2V distill LoRAs come paired (one per
sub-model) and must be applied to the correct target. Allow
``target="both"`` for LoRAs that should be applied to both sub-models
(e.g. style LoRAs).
The dense Wan2.2-TI2V-5B DIT has a single set of weights (no MoE
experts), so ``target`` is always ``"both"``. The field is kept for
forward compatibility and config-file compatibility with older MoE
configs — legacy ``"high_noise"`` / ``"low_noise"`` values are coerced
to ``"both"`` in ``VideoConfig.from_dict``.
"""
path: str
@@ -60,18 +60,20 @@ class VideoConfig:
# Model paths — can be overridden via config.yml.video.models.
# wan22_base_repo : HF repo id (or local dir) providing T5/VAE/tokenizer.
# The bf16 DIT shards in this repo are skipped — we
# replace them with quantised files from wan22_dit_repo.
# wan22_dit_repo : HF repo id (or local dir) providing the quantised
# DIT checkpoints (fp8 or GGUF).
# wan22_dit_quant_scheme : quantisation format, e.g. "fp8-sgl" or "gguf-Q4_K_M".
# replace them with a quantised GGUF from wan22_dit_repo.
# wan22_dit_repo : HF repo id (or local dir) providing the single
# dense GGUF DIT checkpoint (5B Turbo).
# wan22_dit_quant_scheme : GGUF quant level, e.g. "gguf-Q8_0" (default)
# or "gguf-Q4_K_M" for lower VRAM.
# wan22_config_json : path to the LightX2V inference config template the
# Wan22Pipeline will fill in with absolute ckpt paths.
wan22_base_repo: str = "Wan-AI/Wan2.2-I2V-A14B"
wan22_dit_repo: str = "lightx2v/Wan2.2-Distill-Models"
wan22_dit_quant_scheme: str = "fp8-sgl"
wan22_t5_quantized: bool = False
wan22_config_json: str = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
wan22_model_cls: str = "wan2.2_moe_distill"
wan22_base_repo: str = "Wan-AI/Wan2.2-TI2V-5B"
wan22_dit_repo: str = "hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF"
wan22_dit_quant_scheme: str = "gguf-Q8_0"
wan22_t5_quantized: bool = True
wan22_config_json: str = "/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json"
wan22_model_cls: str = "wan2.2"
musetalk_enabled: bool = True
musetalk_model_path: str = "TMElyralab/MuseTalk"
@classmethod
@@ -92,9 +94,10 @@ class VideoConfig:
if not entry or "path" not in entry:
continue
target = str(entry.get("target", "both")).lower()
if target not in ("high_noise", "low_noise", "both"):
if target != "both":
log.warning(
"LoRA %s: invalid target %r, defaulting to 'both'",
"LoRA %s: target %r is MoE-era; coercing to 'both' "
"(dense 5B has a single DIT).",
entry.get("path"), target,
)
target = "both"
@@ -122,30 +125,32 @@ class VideoConfig:
reflective_prompt_reply_words=int(reflective.get("prompt_reply_words", 18)),
loras=loras,
wan22_base_repo=str(
models_raw.get("wan22_base_repo", "Wan-AI/Wan2.2-I2V-A14B")
models_raw.get("wan22_base_repo", "Wan-AI/Wan2.2-TI2V-5B")
),
wan22_dit_repo=str(
models_raw.get(
"wan22_dit_repo",
# Backwards compat: fall back to old key name.
models_raw.get("wan22_fp8_repo", "lightx2v/Wan2.2-Distill-Models"),
"hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF",
)
),
wan22_dit_quant_scheme=str(
models_raw.get("wan22_dit_quant_scheme", "fp8-sgl")
models_raw.get("wan22_dit_quant_scheme", "gguf-Q8_0")
),
wan22_t5_quantized=bool(
models_raw.get("wan22_t5_quantized", False)
models_raw.get("wan22_t5_quantized", True)
),
wan22_config_json=str(
models_raw.get(
"wan22_config_json",
"/app/configs/lightx2v/wan22_i2v_fp8_distill.json",
"/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json",
)
),
wan22_model_cls=str(
models_raw.get("wan22_model_cls", "wan2.2_moe_distill")
models_raw.get("wan22_model_cls", "wan2.2")
),
musetalk_enabled=bool(raw.get("musetalk", {}).get("enabled", True))
if isinstance(raw.get("musetalk"), dict)
else bool(raw.get("musetalk_enabled", True)),
musetalk_model_path=str(
models_raw.get("musetalk_path", "TMElyralab/MuseTalk")
),
@@ -235,17 +240,22 @@ class VideoEngine:
self._wan22.load_loras(self.cfg.loras)
log.info("Wan2.2 pipeline ready.")
log.info("Loading MuseTalk engine (%s)...", self.cfg.musetalk_model_path)
self._musetalk = MuseTalkEngine(model_path=self.cfg.musetalk_model_path)
log.info("MuseTalk engine ready.")
if self.cfg.musetalk_enabled:
log.info("Loading MuseTalk engine (%s)...", self.cfg.musetalk_model_path)
self._musetalk = MuseTalkEngine(model_path=self.cfg.musetalk_model_path)
log.info("MuseTalk engine ready.")
else:
log.info("MuseTalk disabled via config — skipping lip-sync pass.")
self._musetalk = None
# --- Readiness ------------------------------------------------------
def is_ready(self) -> bool:
    """Return True when an avatar is set and a speaking clip can be produced.

    Readiness requires the Wan2.2 pipeline, an avatar path and an idle clip.
    The MuseTalk engine is only required when ``cfg.musetalk_enabled`` is
    true; with lip-sync disabled ``_musetalk`` is legitimately ``None`` and
    must not block readiness.
    """
    # MuseTalk counts as satisfied when it is disabled or when it is loaded.
    musetalk_ok = (not self.cfg.musetalk_enabled) or self._musetalk is not None
    return (
        self._wan22 is not None
        and musetalk_ok
        and self.avatar_path is not None
        and self.idle_clip_mp4 is not None
    )
@@ -336,7 +346,6 @@ class VideoEngine:
"(avatar set? models loaded?)"
)
assert self._wan22 is not None
assert self._musetalk is not None
# 1. Source base frames.
if self.cfg.mode == "library":
@@ -351,13 +360,16 @@ class VideoEngine:
seed=None, # random each turn
)
# 2. Lip-sync the base frames to the given audio.
synced_frames = self._musetalk.lip_sync(
frames=base_frames,
audio=audio_f32,
sample_rate=sample_rate,
fps=self.cfg.fps,
)
# 2. Lip-sync the base frames to the given audio (if enabled).
if self._musetalk is not None:
synced_frames = self._musetalk.lip_sync(
frames=base_frames,
audio=audio_f32,
sample_rate=sample_rate,
fps=self.cfg.fps,
)
else:
synced_frames = base_frames
# 3. Mux frames + audio into an MP4.
from server.video_models.muxer import frames_and_audio_to_mp4