first stab at adding video

2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
@@ -12,3 +12,49 @@ llm:
  lmstudio:
    url: http://host.docker.internal:1234   # host.docker.internal resolves to your PC from inside Docker
    model: ""           # leave empty to use whatever model LM Studio has loaded
+
+# Avatar video generation (Wan2.2-Lightning fp8 via LightX2V + MuseTalk lip-sync)
+video:
+  enabled: false                  # master switch — video models are not loaded while this is false
+  backend: "lightx2v"             # the sole backend available for now
+  mode: "reflective"              # "library" = pre-baked clips | "reflective" = fresh clip per turn
+  resolution: 480                 # accepted values: 480 or 720
+  fps: 16                         # native Wan2.2 rate; MuseTalk resamples as needed
+
+  library:                       # NOTE(review): presumably only read when mode is "library" — confirm
+    base_clip_count: 4            # number of speaking base clips pre-generated for each avatar
+    base_clip_seconds: 6          # how long each pre-baked clip runs
+
+  reflective:
+    clip_seconds: 5               # target duration of the fresh Wan2.2 clip made each turn
+    clip_prompt_template: >-
+      webcam view of a person speaking, {reply_hint}, casual gestures,
+      natural lighting, soft focus background
+    prompt_reply_words: 18        # cap on words taken from the reply to fill {reply_hint}
+
+  # Model sources for the video stack. The fp8 (e4m3) 4-step distilled DiT
+  # weights from lightx2v/Wan2.2-Distill-Models are ~15 GB each (vs ~28 GB
+  # in bf16) — that's the "save VRAM" path. The T5/VAE/tokenizer still come
+  # from the Wan-AI base repo. Both repos are downloaded on first run into
+  # HF_HOME=/cache/huggingface.
+  models:
+    wan22_base_repo: "Wan-AI/Wan2.2-I2V-A14B"  # base repo; supplies T5/VAE/tokenizer
+    wan22_fp8_repo: "lightx2v/Wan2.2-Distill-Models"  # fp8 4-step distilled weights
+    wan22_model_cls: "wan2.2_moe_distill"
+    wan22_config_json: "/app/configs/lightx2v/wan22_i2v_fp8_distill.json"
+    musetalk_path: "TMElyralab/MuseTalk"  # NOTE(review): looks like a repo id rather than a filesystem path — confirm
+
+  # LoRAs applied to the fp8 base at load time via runtime switch_lora.
+  # Wan2.2 is an MoE with separate high-noise and low-noise sub-models —
+  # `target` picks which sub-model each LoRA attaches to. The two files
+  # below are the user-supplied ./loras/wan22-[HL]-e8.safetensors, mounted
+  # into the container at /cache/loras/.
+  loras:
+    - path: "/cache/loras/wan22-H-e8.safetensors"
+      weight: 1.0
+      target: "high_noise"  # sub-model this LoRA attaches to
+      name: "wan22-H-e8"
+    - path: "/cache/loras/wan22-L-e8.safetensors"
+      weight: 1.0
+      target: "low_noise"  # sub-model this LoRA attaches to
+      name: "wan22-L-e8"