FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04

# Build-time only: keeps apt from prompting during `docker build` without
# leaking the setting into the environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive

# PYTHONUNBUFFERED: flush stdout/stderr immediately so logs show up in
#   `docker logs` as they are written.
# HF_HOME: HuggingFace model cache — mounted as a volume so models persist
#   across runs.
# LORA_DIR: LoRA directory — users drop .safetensors files here and reference
#   them from config.yml::video.loras. Bind-mounted via docker-compose.
ENV PYTHONUNBUFFERED=1 \
    HF_HOME=/cache/huggingface \
    LORA_DIR=/cache/loras

# Prevent PyTorch/OpenMP/MKL thread pools from spin-waiting when idle.
# Without this, loading large models (ASR, LLM, Wan2.2) causes all CPU cores
# to busy-loop even with no connected clients, slowing the whole system.
ENV OMP_WAIT_POLICY=PASSIVE \
    MKL_WAIT_POLICY=PASSIVE \
    TOKENIZERS_PARALLELISM=false

# System packages. `ca-certificates` is listed explicitly because it is only a
# "Recommends" of curl/git/python and --no-install-recommends would otherwise
# be free to skip it; every HTTPS download below depends on it.
# update + install + list cleanup stay in one layer so no stale apt index is
# cached and the package lists never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
        git \
        python3.11 \
        python3.11-dev \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Bootstrap pip for python3.11 (Debian disables ensurepip for system Python).
# The script is downloaded to a file instead of being piped into the
# interpreter: with the default /bin/sh (no pipefail) a failed download would
# be masked by python3.11 exiting 0 on empty stdin. `-f` makes curl fail on
# HTTP errors rather than saving an error page.
RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py -o /tmp/get-pip.py \
    && python3.11 /tmp/get-pip.py --no-cache-dir \
    && rm -f /tmp/get-pip.py

# Make plain `python` resolve to the 3.11 interpreter.
RUN ln -sf /usr/bin/python3.11 /usr/bin/python

WORKDIR /app

# Install PyTorch 2.7+ with CUDA 12.8 support (includes Blackwell/sm_120 support)
RUN python3.11 -m pip install --no-cache-dir \
        "torch>=2.7" \
        --index-url https://download.pytorch.org/whl/cu128

# Install auto-gptq pre-built wheel for CUDA 12.8 (avoids compiling from source)
RUN python3.11 -m pip install --no-cache-dir \
        "auto-gptq>=0.7.1" \
        --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/

# Install the rest of the app requirements. Only the manifest is copied here so
# this layer stays cached until requirements.txt itself changes.
COPY requirements.txt .
RUN python3.11 -m pip install --no-cache-dir -r requirements.txt

# Pre-download the spacy model that kokoro needs at runtime
RUN python3.11 -m spacy download en_core_web_sm

# --- Optional: avatar video stack -------------------------------------------
# These are heavy installs; keep them after the core deps so that changes to
# the video stack only rebuild these layers, not the core ones. If you don't
# plan to use config.video.enabled=true, you can comment this block out to
# speed up builds and shrink the image.
#
# Every install in this block is best-effort on purpose: a failure prints a
# warning via `|| echo` instead of failing the build, so the core image is
# still produced and the corresponding feature simply has to stay disabled.
#
# LightX2V (Wan2.2-Lightning inference framework) — installed from source
# since there is no stable PyPI release yet.
# Pinned to 2026-04-14: last commit before WorldMirrorRunner was added to
# pipeline.py (which requires flash_attn + matplotlib) and before the
# dummy_model NameError regression in vae_2_2.py.
RUN python3.11 -m pip install --no-cache-dir \
        "git+https://github.com/ModelTC/LightX2V.git@6db002f2755036b02bd0900bf9b41958bbfb4137" || \
    echo "LightX2V install failed — config.video.enabled must stay false until fixed"

# sgl-kernel (fp8 T5 encoder acceleration). The PyPI wheel lacks SM120
# (Blackwell) CUTLASS kernels; use SGLang's cu128 wheel instead.
# Our wan22.py patches fp8_scaled_mm → torch._scaled_mm at runtime for
# Blackwell GPUs, but the sgl_kernel package itself must still be present.
RUN python3.11 -m pip install --no-cache-dir --no-deps \
        "sgl-kernel @ https://github.com/sgl-project/whl/releases/download/v0.3.14.post1/sgl_kernel-0.3.14.post1%2Bcu128-cp310-abi3-manylinux2014_x86_64.whl" || \
    echo "sgl-kernel install failed — fp8 T5 will fall back to bf16"

# MuseTalk (audio-driven lip-sync) — installed from the bhetherman/MuseTalk
# fork checked in as a submodule at third_party/MuseTalk. The upstream repo
# has no setup.py / pyproject.toml; our fork adds them so `pip install .`
# just works. We deliberately do NOT install its requirements.txt (it pins
# numpy==1.23.5, transformers==4.39.2, tensorflow==2.12.0, which conflict
# with the rest of the stack) — instead we install its real runtime deps
# explicitly here.
COPY third_party/MuseTalk /opt/MuseTalk
RUN python3.11 -m pip install --no-cache-dir --no-deps /opt/MuseTalk || \
    echo "MuseTalk install failed — config.video.musetalk.enabled must stay false until fixed"
RUN python3.11 -m pip install --no-cache-dir \
        librosa einops omegaconf ffmpeg-python || \
    echo "MuseTalk runtime deps install failed"

# LoRA directory (user drops .safetensors here; bind-mounted in compose).
# Created from LORA_DIR so the path is defined in exactly one place.
RUN mkdir -p "${LORA_DIR}"

# Application source goes last: it changes most often, so every dependency
# layer above stays cached across code-only rebuilds.
COPY . .

# Documents the port the image is expected to serve on (not visible in this
# file — presumably bound by run.py; verify). EXPOSE does not publish it.
EXPOSE 8000

CMD ["python3.11", "run.py"]