# live-voice-chat/Dockerfile

# Base image: CUDA 12.8.0 + cuDNN "devel" variant on Ubuntu 22.04 (per the tag).
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04

# Build-time only: stops apt/debconf from prompting during `docker build`.
# Declared as ARG rather than ENV so it does not leak into the environment
# of the running container.
ARG DEBIAN_FRONTEND=noninteractive

# Runtime environment:
#   PYTHONUNBUFFERED  unbuffered stdout/stderr so logs show up immediately
#   HF_HOME           HuggingFace model cache — mounted as a volume so models
#                     persist across runs
#   LORA_DIR          LoRA directory — users drop .safetensors files here and
#                     reference them from config.yml::video.loras.
#                     Bind-mounted via docker-compose.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/cache/huggingface \
    LORA_DIR=/cache/loras

# System packages (sorted alphabetically for easier diffs):
#   ca-certificates  TLS roots for the HTTPS downloads done by curl/git/pip;
#                    listed explicitly because recommended packages are skipped
#   curl             fetches the pip bootstrap script
#   ffmpeg           audio/video tooling
#   git              needed for the `pip install git+https://...` steps
#   python3.11*      interpreter, headers and venv support
# --no-install-recommends keeps optional extras out of the image; the apt
# lists are removed in the same layer so they never persist in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    ffmpeg \
    git \
    python3.11 \
    python3.11-dev \
    python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Bootstrap pip for python3.11 (Debian disables ensurepip for system Python).
# The script is downloaded to a file first instead of being piped into the
# interpreter: with `curl ... | python3.11` under /bin/sh (no pipefail) a
# failed or truncated download would be fed to Python unnoticed. `-f` makes
# curl fail on HTTP errors, `-L` follows redirects, and the `&&` chain aborts
# the build on any failure. The script is removed in the same layer.
RUN curl -fsSL -o /tmp/get-pip.py https://bootstrap.pypa.io/get-pip.py \
    && python3.11 /tmp/get-pip.py --no-cache-dir \
    && rm -f /tmp/get-pip.py

# Make plain `python` resolve to the 3.11 interpreter.
RUN ln -sf /usr/bin/python3.11 /usr/bin/python

# All following relative paths (COPY destinations, CMD) resolve against /app.
WORKDIR /app

# Install PyTorch 2.7+ with CUDA 12.8 support (includes Blackwell/sm_120 support).
# The version floor is stated explicitly so the requirement named here is
# enforced by pip rather than only implied by whatever the index serves.
RUN python3.11 -m pip install --no-cache-dir \
    --index-url https://download.pytorch.org/whl/cu128 \
    "torch>=2.7"

# auto-gptq: prefer a pre-built CUDA 12.8 wheel so nothing has to be compiled
# from source. The extra index is consulted in addition to PyPI.
# NOTE(review): confirm the cu128 index below actually publishes wheels;
# if it does not, pip will resolve auto-gptq from PyPI instead.
RUN python3.11 -m pip install \
    --no-cache-dir \
    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/ \
    "auto-gptq>=0.7.1"

# Install the rest of the app requirements. Only requirements.txt is copied
# at this point so this layer stays cached until the requirements change.
COPY requirements.txt .
RUN python3.11 -m pip install --no-cache-dir -r requirements.txt

# Pre-download the spacy model that kokoro needs at runtime.
# `spacy download` installs the model through pip and forwards extra
# arguments to it; --no-cache-dir keeps pip's cache out of this layer.
RUN python3.11 -m spacy download en_core_web_sm --no-cache-dir

# --- Optional: avatar video stack -------------------------------------------
# Heavy installs, deliberately placed after the core dependencies so that a
# change to the video stack alone does not force the core layers to rebuild.
# If config.video.enabled=true is never going to be used, this block can be
# commented out for faster builds and a smaller image.
#
# LightX2V (Wan2.2-Lightning inference framework) is installed straight from
# its git repository because no stable PyPI release exists yet.
# Best-effort: a failed install only prints a warning and the build goes on.
RUN if ! python3.11 -m pip install --no-cache-dir \
        "git+https://github.com/ModelTC/LightX2V.git"; then \
        echo "LightX2V install failed — config.video.enabled must stay false until fixed"; \
    fi
#
# sgl-kernel (fp8 T5 encoder acceleration). The wheel on PyPI lacks SM120
# (Blackwell) CUTLASS kernels, so the cu128 build from SGLang's wheel
# releases is installed by direct URL, without dependencies (--no-deps).
# Our wan22.py patches fp8_scaled_mm → torch._scaled_mm at runtime for
# Blackwell GPUs, but the sgl_kernel package itself must still be present.
# Best-effort: a failed install only prints a warning and the build goes on.
RUN if ! python3.11 -m pip install --no-cache-dir --no-deps \
        "sgl-kernel @ https://github.com/sgl-project/whl/releases/download/v0.3.14.post1/sgl_kernel-0.3.14.post1%2Bcu128-cp310-abi3-manylinux2014_x86_64.whl"; then \
        echo "sgl-kernel install failed — fp8 T5 will fall back to bf16"; \
    fi
#
# MuseTalk (audio-driven lip-sync) — likewise installed from its git
# repository, and likewise best-effort: a failed install only prints a
# warning and the build goes on.
RUN if ! python3.11 -m pip install --no-cache-dir \
        "git+https://github.com/TMElyralab/MuseTalk.git"; then \
        echo "MuseTalk install failed — config.video.enabled must stay false until fixed"; \
    fi
#
# Create the LoRA directory inside the image (users drop .safetensors files
# there; it is bind-mounted in compose). The path comes from the LORA_DIR
# environment variable set near the top of this file (/cache/loras).
RUN mkdir -p "${LORA_DIR}"

# Application source goes in last so code edits do not invalidate the
# dependency layers above. The destination is the current WORKDIR.
COPY . ./

# Documents the port the container is expected to listen on; it does not
# publish the port by itself.
EXPOSE 8000

# Exec-form CMD: run.py is started directly by the interpreter, with no
# wrapping shell.
CMD ["python3.11", "run.py"]