# NOTE(review): this file had its line breaks and indentation collapsed, which
# left most settings inside a single comment line. The nesting below was
# reconstructed from the surrounding comments — confirm against the config
# loader that `lmstudio` belongs under `llm`, and that `models` and `loras`
# belong under `video`.

# LLM backend: "local" or "lmstudio"
llm:
  backend: local  # change to "lmstudio" to use LM Studio instead
  # max KV-cache size per session (tokens); 0 to disable caching
  max_cache_tokens: 4096
  system_prompt: >-
    You are a helpful voice assistant. Keep your responses extremely
    concise but natural for spoken conversation. Do not use markdown,
    bullet points, code blocks, emojis, or any formatting that doesn't
    work in speech.

  # Settings used only when backend = "lmstudio"
  lmstudio:
    # host.docker.internal resolves to your PC from inside Docker
    url: http://host.docker.internal:1234
    model: ""  # leave empty to use whatever model LM Studio has loaded

# Avatar video generation (Wan2.2-Lightning fp8 via LightX2V + MuseTalk lip-sync)
video:
  enabled: false  # master toggle — when false, video models are not loaded
  backend: lightx2v  # only option for now
  # "library" (pre-baked clips) | "reflective" (fresh per turn)
  mode: reflective
  resolution: 480  # 480 or 720
  fps: 16  # Wan2.2 native rate; MuseTalk resamples as needed

  library:
    # how many speaking base clips to pre-generate per avatar
    base_clip_count: 4
    base_clip_seconds: 6  # duration of each pre-baked clip

  reflective:
    clip_seconds: 5  # target length of each fresh Wan2.2 clip per turn
    clip_prompt_template: >-
      webcam view of a person speaking, {reply_hint}, casual gestures,
      natural lighting, soft focus background
    # max words lifted from reply to inject as {reply_hint}
    prompt_reply_words: 18

  # Model sources for the video stack. The fp8 e4m3 4-step distilled DIT
  # weights from lightx2v/Wan2.2-Distill-Models are ~15 GB each (vs ~28 GB
  # bf16) — that's the "save VRAM" path. T5/VAE/tokenizer still come from
  # the Wan-AI base repo. Both repos download on first run into
  # HF_HOME=/cache/huggingface.
  models:
    wan22_base_repo: Wan-AI/Wan2.2-I2V-A14B
    wan22_fp8_repo: lightx2v/Wan2.2-Distill-Models
    wan22_model_cls: wan2.2_moe_distill
    wan22_config_json: /app/configs/lightx2v/wan22_i2v_fp8_distill.json
    musetalk_path: TMElyralab/MuseTalk

  # LoRAs applied to the fp8 base at load time via runtime switch_lora.
  # Wan2.2 is a MoE with separate high-noise and low-noise sub-models —
  # `target` picks which sub-model each LoRA attaches to. The two files
  # below are the user-supplied ./loras/wan22-[HL]-e8.safetensors mounted
  # into the container at /cache/loras/.
  loras:
    - path: /cache/loras/wan22-H-e8.safetensors
      weight: 1.0
      target: high_noise
      name: wan22-H-e8
    - path: /cache/loras/wan22-L-e8.safetensors
      weight: 1.0
      target: low_noise
      name: wan22-L-e8