44a10667c2
- Added environment variables to prevent CPU thread pools from busy-waiting. - Deferred loading of video models until first use to reduce VRAM footprint. - Implemented streaming of speaking clips for improved responsiveness. - Introduced a queue for managing speaking clips to handle multiple requests smoothly. - Updated video playback logic to ensure proper handling of clip generation.
95 lines
4.1 KiB
Docker
95 lines
4.1 KiB
Docker
FROM nvidia/cuda:12.8.0-cudnn-devel-ubuntu22.04

# Basics: keep apt from prompting and have Python write its output unbuffered.
ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1

# Cache locations under /cache:
#   HF_HOME  - HuggingFace model cache, mounted as a volume so models persist
#              across runs.
#   LORA_DIR - users drop .safetensors files here and reference them from
#              config.yml::video.loras; bind-mounted via docker-compose.
ENV HF_HOME=/cache/huggingface \
    LORA_DIR=/cache/loras

# Stop PyTorch/OpenMP/MKL thread pools from spin-waiting when idle. Without
# these, loading large models (ASR, LLM, Wan2.2) makes every CPU core
# busy-loop even with no connected clients, slowing the whole system.
ENV OMP_WAIT_POLICY=PASSIVE \
    MKL_WAIT_POLICY=PASSIVE \
    TOKENIZERS_PARALLELISM=false

# System packages: the Python 3.11 toolchain plus git, ffmpeg and curl.
# --no-install-recommends keeps optional extras out of the image, so
# ca-certificates is listed explicitly: the HTTPS downloads done by curl, git
# and pip later in this file need it, and it may otherwise only arrive as a
# recommended package. The apt lists are removed in the same layer so they
# never reach the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
        git \
        python3.11 \
        python3.11-dev \
        python3.11-venv \
    && rm -rf /var/lib/apt/lists/*

# Bootstrap pip for python3.11 (Debian disables ensurepip for system Python).
# The installer is downloaded to a file instead of being piped into the
# interpreter: with `curl | python3.11` under the default /bin/sh (no
# pipefail), a failed download hands Python an empty script, which exits 0 and
# silently leaves the image without pip. `-f` makes curl fail on HTTP errors
# and the `&&` chain aborts the build at the first failing step.
RUN curl -fsSL -o /tmp/get-pip.py https://bootstrap.pypa.io/get-pip.py \
    && python3.11 /tmp/get-pip.py --no-cache-dir \
    && rm -f /tmp/get-pip.py

# Make plain `python` resolve to the 3.11 interpreter.
RUN ln -sf /usr/bin/python3.11 /usr/bin/python

WORKDIR /app

# PyTorch 2.7+ with CUDA 12.8 support (includes Blackwell/sm_120 support),
# resolved from the PyTorch cu128 wheel index.
RUN python3.11 -m pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu128 \
        torch

# auto-gptq as a pre-built CUDA 12.8 wheel, which avoids compiling it from
# source.
RUN python3.11 -m pip install --no-cache-dir \
        --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu128/ \
        "auto-gptq>=0.7.1"

# Remaining application requirements. Only requirements.txt is copied at this
# point; the rest of the source tree is copied later in the file.
COPY requirements.txt ./
RUN python3.11 -m pip install --no-cache-dir --requirement requirements.txt

# Download the spaCy model that kokoro needs at runtime while building the
# image.
RUN python3.11 -m spacy download en_core_web_sm

# --- Optional: avatar video stack -------------------------------------------
# Heavy installs, kept after the core deps so a rebuild only redoes these
# layers when ONLY the video stack changes. If you don't plan to use
# config.video.enabled=true, you can comment this block out to speed up
# builds and shrink the image.
#
# LightX2V (Wan2.2-Lightning inference framework), installed from source since
# there is no stable PyPI release yet. Pinned to 2026-04-14: the last commit
# before WorldMirrorRunner was added to pipeline.py (which requires
# flash_attn + matplotlib) and before the dummy_model NameError regression in
# vae_2_2.py. Best-effort: a failed install prints a notice and the build
# carries on.
RUN if ! python3.11 -m pip install --no-cache-dir \
        "git+https://github.com/ModelTC/LightX2V.git@6db002f2755036b02bd0900bf9b41958bbfb4137"; \
    then \
        echo "LightX2V install failed — config.video.enabled must stay false until fixed"; \
    fi

# sgl-kernel (fp8 T5 encoder acceleration). The PyPI wheel lacks SM120
# (Blackwell) CUTLASS kernels, so the wheel comes from SGLang's cu128 release
# instead. Our wan22.py patches fp8_scaled_mm -> torch._scaled_mm at runtime
# for Blackwell GPUs, but the sgl_kernel package itself must still be present.
# Best-effort, like the install above.
RUN if ! python3.11 -m pip install --no-cache-dir --no-deps \
        "sgl-kernel @ https://github.com/sgl-project/whl/releases/download/v0.3.14.post1/sgl_kernel-0.3.14.post1%2Bcu128-cp310-abi3-manylinux2014_x86_64.whl"; \
    then \
        echo "sgl-kernel install failed — fp8 T5 will fall back to bf16"; \
    fi

# MuseTalk (audio-driven lip-sync), installed from the bhetherman/MuseTalk
# fork checked in as a submodule at third_party/MuseTalk. The upstream repo
# has no setup.py / pyproject.toml; our fork adds them so `pip install .`
# just works. Its requirements.txt is deliberately NOT installed (it pins
# numpy==1.23.5, transformers==4.39.2, tensorflow==2.12.0, which conflict
# with the rest of the stack); the real runtime deps are installed explicitly
# in the step after the package itself. Both steps are best-effort: a failure
# prints a notice and the build carries on.
COPY third_party/MuseTalk /opt/MuseTalk
RUN if ! python3.11 -m pip install --no-cache-dir --no-deps /opt/MuseTalk; \
    then \
        echo "MuseTalk install failed — config.video.musetalk.enabled must stay false until fixed"; \
    fi
RUN if ! python3.11 -m pip install --no-cache-dir \
        einops \
        ffmpeg-python \
        librosa \
        omegaconf; \
    then \
        echo "MuseTalk runtime deps install failed"; \
    fi

# Create the LoRA directory inside the image; users drop .safetensors files
# there and it is bind-mounted in compose.
RUN mkdir --parents /cache/loras

# Application source goes into the working directory last.
COPY . ./

# The container's documented port.
EXPOSE 8000

# Start the application through its run.py entry script.
CMD ["python3.11", "run.py"]