# live-voice-chat/config.yml

# Language-model settings.
llm:
  # Which LLM backend to use: "local" or "lmstudio" (LM Studio).
  backend: local

  # Maximum KV-cache size per session, in tokens. Set to 0 to disable caching.
  max_cache_tokens: 4096

  # Folded scalar (>-): the wrapped lines below are joined with single
  # spaces into one line, with no trailing newline.
  system_prompt: >-
    You are a helpful voice assistant. Keep your responses extremely
    concise but natural for spoken conversation. Do not use markdown,
    bullet points, code blocks, emojis, or any formatting that doesn't
    work in speech.

  # LM Studio connection details; used only when backend = "lmstudio".
  lmstudio:
    # host.docker.internal resolves to your PC from inside Docker.
    url: "http://host.docker.internal:1234"
    # Leave empty to use whatever model LM Studio has loaded.
    model: ""

# Avatar video generation: Wan2.2-TI2V-5B-Turbo (GGUF) through LightX2V,
# plus MuseTalk lip-sync.
video:
  # Master toggle. When false, video models are not loaded.
  enabled: true
  # Generation backend; lightx2v is the only option for now.
  backend: lightx2v
  # "library" uses pre-baked clips; "reflective" makes a fresh clip per turn.
  mode: reflective
  # Output resolution: 480 or 720.
  resolution: 480
  # Frames per second. 16 is the Wan2.2 native rate; MuseTalk resamples
  # as needed.
  fps: 16

  # Pre-baked clip settings.
  # NOTE(review): presumably only consulted when mode is "library" - confirm.
  library:
    # How many speaking base clips to pre-generate per avatar.
    base_clip_count: 4
    # Duration of each pre-baked clip, in seconds.
    base_clip_seconds: 6

  # MuseTalk audio-driven lip-sync. With this disabled, the Wan2.2 base
  # frames are used as-is, with no lip-sync pass. That is useful when
  # MuseTalk isn't installed or while iterating on the base pipeline.
  musetalk:
    # Turns the lip-sync pass on or off.
    enabled: false

  # Fresh-per-turn clip settings.
  # NOTE(review): presumably only consulted when mode is "reflective" - confirm.
  reflective:
    # Target length, in seconds, of each fresh Wan2.2 clip per turn.
    clip_seconds: 5
    # Folded scalar (>-): the wrapped lines are joined with single spaces.
    # {reply_hint} is the placeholder filled with words from the reply.
    clip_prompt_template: >-
      webcam view of a person speaking, {reply_hint}, casual gestures,
      natural lighting, soft focus background
    # Maximum number of words lifted from the reply to inject as {reply_hint}.
    prompt_reply_words: 18

  # Model sources for the video stack. T5, VAE and tokenizer come from the
  # Wan-AI base repo; the single dense DIT comes from wan22_dit_repo as GGUF
  # (Turbo 4-step distill). Both repos download on first run into
  # HF_HOME=/cache/huggingface.
  #
  # Supported wan22_dit_quant_scheme values (dense 5B Turbo, GGUF only):
  #   gguf-Q8_0    8-bit, ~6 GB DIT, ~6.5 GB VRAM at load (default)
  #   gguf-Q4_K_M  4-bit, ~3.5 GB DIT, lower VRAM for tight budgets
  #   (any gguf-<level> published in hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF)
  models:
    # Base repo providing T5, VAE and tokenizer.
    wan22_base_repo: Wan-AI/Wan2.2-TI2V-5B
    # Repo providing the dense DIT as GGUF.
    wan22_dit_repo: hum-ma/Wan2.2-TI2V-5B-Turbo-GGUF
    # One of the quantization schemes listed above.
    wan22_dit_quant_scheme: gguf-Q8_0
    wan22_t5_quantized: true
    wan22_model_cls: wan2.2
    wan22_config_json: /app/configs/lightx2v/wan22_i2v_gguf_5b_turbo.json
    musetalk_path: TMElyralab/MuseTalk

  # LoRAs applied to the dense 5B DIT at load time through LightX2V's
  # lora_dynamic_apply path (merged during GGUF dequant). The dense model
  # has a single set of weights, so `target` is always `both`.
  #
  # The old MoE-trained wan22-H-e8 / wan22-L-e8 LoRAs are NOT compatible
  # with the 5B DIT and are disabled here. To add a 5B-compatible LoRA,
  # replace the empty list with entries shaped like the commented example
  # below (do not keep both `loras:` keys; duplicate keys are invalid).
  loras: []
  # loras:
  #   - path: /cache/loras/your-5b-lora.safetensors
  #     weight: 1.0
  #     target: both
  #     name: your-5b-lora