working ok
This commit is contained in:
+49
-37
@@ -18,18 +18,18 @@ import numpy as np
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
LoRATarget = Literal["high_noise", "low_noise", "both"]
|
||||
LoRATarget = Literal["both"]
|
||||
|
||||
|
||||
@dataclass
|
||||
class LoRASpec:
|
||||
"""One LoRA adapter entry from ``config.video.loras``.
|
||||
|
||||
Wan2.2 I2V is a Mixture-of-Experts model with separate high-noise and
|
||||
low-noise sub-models. Most LightX2V distill LoRAs come paired (one per
|
||||
sub-model) and must be applied to the correct target. Allow
|
||||
``target="both"`` for LoRAs that should be applied to both sub-models
|
||||
(e.g. style LoRAs).
|
||||
The dense Wan2.2-TI2V-5B DIT has a single set of weights (no MoE
|
||||
experts), so ``target`` is always ``"both"``. The field is kept for
|
||||
forward compatibility and config-file compatibility with older MoE
|
||||
configs — legacy ``"high_noise"`` / ``"low_noise"`` values are coerced
|
||||
to ``"both"`` in ``VideoConfig.from_dict``.
|
||||
"""
|
||||
|
||||
path: str
|
||||
@@ -60,18 +60,20 @@ class VideoConfig:
|
||||
# Model paths — can be overridden via config.yml.video.models.
|
||||
# wan22_base_repo : HF repo id (or local dir) providing T5/VAE/tokenizer.
|
||||
# The bf16 DIT shards in this repo are skipped — we
|
||||
# replace them with quantised files from wan22_dit_repo.
|
||||
# wan22_dit_repo : HF repo id (or local dir) providing the quantised
|
||||
# DIT checkpoints (fp8 or GGUF).
|
||||
# wan22_dit_quant_scheme : quantisation format, e.g. "fp8-sgl" or "gguf-Q4_K_M".
|
||||
# replace them with a quantised GGUF from wan22_dit_repo.
|
||||
# wan22_dit_repo : HF repo id (or local dir) providing the single
|
||||
# dense GGUF DIT checkpoint (5B Turbo).
|
||||
# wan22_dit_quant_scheme : GGUF quant level, e.g. "gguf-Q8_0" (default)
|
||||
# or "gguf-Q4_K_M" for lower VRAM.
|
||||
# wan22_config_json : path to the LightX2V inference config template the
|
||||
# Wan22Pipeline will fill in with absolute ckpt paths.
|
||||
wan22_base_repo: str = "Wan-AI/Wan2.2-I2V-A14B"
|
||||
wan22_dit_repo: str = "lightx2v/Wan2.2-Distill-Models"
|
||||
wan22_dit_quant_scheme: str = "fp8-sgl"
|
||||
wan22_t5_quantized: bool = False
|
||||
wan22_config_json: str = "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
|
||||
wan22_model_cls: str = "wan2.2_moe_distill"
|
||||
wan22_base_repo: str = "Wan-AI/Wan2.2-TI2V-5B"
|
||||
wan22_dit_repo: str = "hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF"
|
||||
wan22_dit_quant_scheme: str = "gguf-Q8_0"
|
||||
wan22_t5_quantized: bool = True
|
||||
wan22_config_json: str = "/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json"
|
||||
wan22_model_cls: str = "wan2.2"
|
||||
musetalk_enabled: bool = True
|
||||
musetalk_model_path: str = "TMElyralab/MuseTalk"
|
||||
|
||||
@classmethod
|
||||
@@ -92,9 +94,10 @@ class VideoConfig:
|
||||
if not entry or "path" not in entry:
|
||||
continue
|
||||
target = str(entry.get("target", "both")).lower()
|
||||
if target not in ("high_noise", "low_noise", "both"):
|
||||
if target != "both":
|
||||
log.warning(
|
||||
"LoRA %s: invalid target %r, defaulting to 'both'",
|
||||
"LoRA %s: target %r is MoE-era; coercing to 'both' "
|
||||
"(dense 5B has a single DIT).",
|
||||
entry.get("path"), target,
|
||||
)
|
||||
target = "both"
|
||||
@@ -122,30 +125,32 @@ class VideoConfig:
|
||||
reflective_prompt_reply_words=int(reflective.get("prompt_reply_words", 18)),
|
||||
loras=loras,
|
||||
wan22_base_repo=str(
|
||||
models_raw.get("wan22_base_repo", "Wan-AI/Wan2.2-I2V-A14B")
|
||||
models_raw.get("wan22_base_repo", "Wan-AI/Wan2.2-TI2V-5B")
|
||||
),
|
||||
wan22_dit_repo=str(
|
||||
models_raw.get(
|
||||
"wan22_dit_repo",
|
||||
# Backwards compat: fall back to old key name.
|
||||
models_raw.get("wan22_fp8_repo", "lightx2v/Wan2.2-Distill-Models"),
|
||||
"hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF",
|
||||
)
|
||||
),
|
||||
wan22_dit_quant_scheme=str(
|
||||
models_raw.get("wan22_dit_quant_scheme", "fp8-sgl")
|
||||
models_raw.get("wan22_dit_quant_scheme", "gguf-Q8_0")
|
||||
),
|
||||
wan22_t5_quantized=bool(
|
||||
models_raw.get("wan22_t5_quantized", False)
|
||||
models_raw.get("wan22_t5_quantized", True)
|
||||
),
|
||||
wan22_config_json=str(
|
||||
models_raw.get(
|
||||
"wan22_config_json",
|
||||
"/app/configs/lightx2v/wan22_i2v_fp8_distill.json",
|
||||
"/app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json",
|
||||
)
|
||||
),
|
||||
wan22_model_cls=str(
|
||||
models_raw.get("wan22_model_cls", "wan2.2_moe_distill")
|
||||
models_raw.get("wan22_model_cls", "wan2.2")
|
||||
),
|
||||
musetalk_enabled=bool(raw.get("musetalk", {}).get("enabled", True))
|
||||
if isinstance(raw.get("musetalk"), dict)
|
||||
else bool(raw.get("musetalk_enabled", True)),
|
||||
musetalk_model_path=str(
|
||||
models_raw.get("musetalk_path", "TMElyralab/MuseTalk")
|
||||
),
|
||||
@@ -235,17 +240,22 @@ class VideoEngine:
|
||||
self._wan22.load_loras(self.cfg.loras)
|
||||
log.info("Wan2.2 pipeline ready.")
|
||||
|
||||
log.info("Loading MuseTalk engine (%s)...", self.cfg.musetalk_model_path)
|
||||
self._musetalk = MuseTalkEngine(model_path=self.cfg.musetalk_model_path)
|
||||
log.info("MuseTalk engine ready.")
|
||||
if self.cfg.musetalk_enabled:
|
||||
log.info("Loading MuseTalk engine (%s)...", self.cfg.musetalk_model_path)
|
||||
self._musetalk = MuseTalkEngine(model_path=self.cfg.musetalk_model_path)
|
||||
log.info("MuseTalk engine ready.")
|
||||
else:
|
||||
log.info("MuseTalk disabled via config — skipping lip-sync pass.")
|
||||
self._musetalk = None
|
||||
|
||||
# --- Readiness ------------------------------------------------------
|
||||
|
||||
def is_ready(self) -> bool:
    """Return True when an avatar is set and a speaking clip can be produced.

    The engine counts as ready when the Wan2.2 pipeline is loaded, an
    avatar path and an idle clip are set, and -- only if
    ``cfg.musetalk_enabled`` is true -- the MuseTalk engine is loaded too.

    Returns:
        bool: True if every required component is present.
    """
    # MuseTalk is optional: a missing engine only blocks readiness when the
    # config asks for lip-sync. Checking ``self._musetalk is not None``
    # unconditionally would make a MuseTalk-disabled engine never ready.
    musetalk_ok = (not self.cfg.musetalk_enabled) or self._musetalk is not None
    return (
        self._wan22 is not None
        and musetalk_ok
        and self.avatar_path is not None
        and self.idle_clip_mp4 is not None
    )
|
||||
@@ -336,7 +346,6 @@ class VideoEngine:
|
||||
"(avatar set? models loaded?)"
|
||||
)
|
||||
assert self._wan22 is not None
|
||||
assert self._musetalk is not None
|
||||
|
||||
# 1. Source base frames.
|
||||
if self.cfg.mode == "library":
|
||||
@@ -351,13 +360,16 @@ class VideoEngine:
|
||||
seed=None, # random each turn
|
||||
)
|
||||
|
||||
# 2. Lip-sync the base frames to the given audio.
|
||||
synced_frames = self._musetalk.lip_sync(
|
||||
frames=base_frames,
|
||||
audio=audio_f32,
|
||||
sample_rate=sample_rate,
|
||||
fps=self.cfg.fps,
|
||||
)
|
||||
# 2. Lip-sync the base frames to the given audio (if enabled).
|
||||
if self._musetalk is not None:
|
||||
synced_frames = self._musetalk.lip_sync(
|
||||
frames=base_frames,
|
||||
audio=audio_f32,
|
||||
sample_rate=sample_rate,
|
||||
fps=self.cfg.fps,
|
||||
)
|
||||
else:
|
||||
synced_frames = base_frames
|
||||
|
||||
# 3. Mux frames + audio into an MP4.
|
||||
from server.video_models.muxer import frames_and_audio_to_mp4
|
||||
|
||||
Reference in New Issue
Block a user