# live-voice-chat/config.yml

# Language-model settings for the voice assistant.
# LLM backend: "local" or "lmstudio"
llm:
  backend: local        # change to "lmstudio" to use LM Studio instead
  max_cache_tokens: 4096  # max KV-cache size per session (tokens); 0 to disable caching

  # System prompt for the assistant. This is a folded block scalar (>-): the
  # lines below are joined with single spaces into one string, and the
  # trailing newline is stripped.
  system_prompt: >-
    You are a helpful voice assistant.
    Keep your responses extremely concise but natural for spoken conversation.
    Do not use markdown, bullet points, code blocks, emojis, or any formatting that doesn't work in speech.

  # Settings used only when backend = "lmstudio"
  lmstudio:
    url: http://host.docker.internal:1234   # host.docker.internal resolves to your PC from inside Docker
    model: ""           # leave empty to use whatever model LM Studio has loaded

# Avatar video generation (Wan2.2-Lightning via LightX2V + MuseTalk lip-sync).
# The DIT weight format (fp8 or GGUF) is chosen by models.wan22_dit_quant_scheme.
video:
  enabled: false                  # master toggle — when false, video models are not loaded
  backend: lightx2v               # only option for now
  # NOTE(review): presumably selects which of the `library` / `reflective`
  # sections below is used — confirm against the loader.
  mode: reflective                # "library" (pre-baked clips) | "reflective" (fresh per turn)
  resolution: 480                 # 480 or 720
  fps: 16                         # Wan2.2 native rate; MuseTalk resamples as needed

  # Pre-baked clip settings; the section name matches mode: "library".
  # NOTE(review): presumably only read when mode is "library" — confirm.
  library:
    base_clip_count: 4            # how many speaking base clips to pre-generate per avatar
    base_clip_seconds: 6          # duration of each pre-baked clip, in seconds

  # Per-turn clip settings; the section name matches mode: "reflective"
  # (a fresh clip generated for each turn).
  reflective:
    clip_seconds: 5               # target length of each fresh Wan2.2 clip per turn
    # Prompt template for each clip. {reply_hint} is a placeholder that gets
    # words lifted from the reply (capped by prompt_reply_words). Folded
    # scalar (>-): the two lines are joined with a single space.
    clip_prompt_template: >-
      webcam view of a person speaking, {reply_hint},
      casual gestures, natural lighting, soft focus background
    prompt_reply_words: 18        # max words lifted from reply to inject as {reply_hint}

  # Model sources for the video stack. T5/VAE/tokenizer come from the
  # Wan-AI base repo (wan22_base_repo). DIT weights come from wan22_dit_repo
  # in the format specified by wan22_dit_quant_scheme. Both repos download on
  # first run into HF_HOME=/cache/huggingface.
  #
  # Supported wan22_dit_quant_scheme values:
  #   fp8-sgl     — fp8 e4m3 safetensors (~15 GB/expert, from lightx2v/Wan2.2-Distill-Models)
  #   gguf-Q4_K_M — GGUF 4-bit (~9.65 GB/expert, from QuantStack/Wan2.2-I2V-A14B-GGUF)
  #   gguf-Q8_0   — GGUF 8-bit (~15.4 GB/expert)
  #   (any gguf-<level> supported by LightX2V — see base_model.py MM_WEIGHT_REGISTER)
  #
  # Keep wan22_dit_repo and wan22_dit_quant_scheme in step: per the list
  # above, fp8-sgl and GGUF weights are published in different repos.
  # NOTE(review): wan22_config_json points at a GGUF-specific config file —
  # presumably it must be switched together with the quant scheme; confirm.
  models:
    wan22_base_repo: Wan-AI/Wan2.2-I2V-A14B
    wan22_dit_repo: QuantStack/Wan2.2-I2V-A14B-GGUF
    wan22_dit_quant_scheme: gguf-Q4_K_M
    wan22_t5_quantized: true
    wan22_model_cls: wan2.2_moe_distill
    wan22_config_json: /app/configs/lightx2v/wan22_i2v_gguf_distill.json
    # NOTE(review): looks like a Hugging Face repo id rather than a
    # filesystem path, despite the key name — confirm.
    musetalk_path: TMElyralab/MuseTalk

  # LoRAs applied to the fp8 base at load time via runtime switch_lora.
  # NOTE(review): models.wan22_dit_quant_scheme is set to a GGUF scheme, not
  # fp8 — confirm this comment (and LoRA loading) still holds for GGUF weights.
  # Wan2.2 is a MoE with separate high-noise and low-noise sub-models —
  # `target` picks which sub-model each LoRA attaches to. The two files
  # below are the user-supplied ./loras/wan22-[HL]-e8.safetensors mounted
  # into the container at /cache/loras/.
  loras:
    # LoRA attached to the high-noise sub-model.
    - path: /cache/loras/wan22-H-e8.safetensors
      weight: 1.0
      target: high_noise
      name: wan22-H-e8
    # LoRA attached to the low-noise sub-model.
    - path: /cache/loras/wan22-L-e8.safetensors
      weight: 1.0
      target: low_noise
      name: wan22-L-e8