updates for docker take 1

2026-04-08 03:05:26 -04:00
parent ce41bca422
commit 0305f1dccd
7 changed files with 108 additions and 7 deletions
@@ -0,0 +1,8 @@
 # Files excluded from the Docker build context.
 # Unlike .gitignore, a pattern without a path prefix only matches at the
 # context root, so the bytecode patterns use **/ to match at any depth.
 # Version control metadata
 .git/
 .gitignore
 # Local secrets — must never be baked into an image layer
 .env
 # Local environments and tool state
 .venv/
 .claude/
 # Python bytecode, at any depth
 **/__pycache__/
 **/*.pyc
 **/*.pyo
 # Top-level documentation
 *.md
@@ -1,2 +1,3 @@
 # Machine-local directories that should not be committed
 .claude
 .venv
 # Python bytecode cache directories
 __pycache__
@@ -0,0 +1,53 @@
 FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
 # Build-time only: keeps apt non-interactive without persisting into the runtime environment.
 ARG DEBIAN_FRONTEND=noninteractive
 ENV PYTHONUNBUFFERED=1
 # HuggingFace model cache — mounted as a volume so models persist across runs
 ENV HF_HOME=/cache/huggingface
 # Make a piped RUN command fail when any stage of the pipe fails
 # (the get-pip download below is piped straight into Python).
 SHELL ["/bin/bash", "-o", "pipefail", "-c"]
 # Toolchain for the PyTorch source build, plus ffmpeg.
 # ca-certificates is listed explicitly so HTTPS downloads via curl/git keep
 # working now that recommended packages are no longer installed.
 RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    ffmpeg \
    git \
    ninja-build \
    python3.11 \
    python3.11-dev \
    python3.11-venv \
    && rm -rf /var/lib/apt/lists/*
 # Bootstrap pip for python3.11 (Debian disables ensurepip for system Python)
 # -f makes curl exit non-zero on an HTTP error instead of piping an error page into Python.
 RUN curl -fsSL https://bootstrap.pypa.io/get-pip.py | python3.11
 RUN ln -sf /usr/bin/python3.11 /usr/bin/python
 WORKDIR /app
 # Build PyTorch from source with Blackwell (sm_120) support.
 # PyTorch's own requirements.txt is installed first because its build scripts
 # import Python packages that a bare interpreter does not have.
 # The clone is removed in the same layer so the source tree does not bloat the image.
 # NOTE(review): arch 12.0 presumably needs a newer CUDA toolkit than the 12.1
 # shipped in this base image — confirm nvcc accepts it, or bump the base image.
 RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \
    cd /tmp/pytorch && \
    git submodule update --init --recursive && \
    python3.11 -m pip install --no-cache-dir -r requirements.txt && \
    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
    python3.11 setup.py install && \
    cd / && rm -rf /tmp/pytorch
 # Install torchvision and torchaudio with CUDA 12.1 support
 # NOTE(review): these wheels declare a dependency on a specific torch release,
 # so pip may replace the source build above with a stock cu121 torch wheel.
 # Verify with `python3.11 -m pip list` after the build; --no-deps may be needed.
 RUN python3.11 -m pip install --no-cache-dir \
    torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu121
 # Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source)
 RUN python3.11 -m pip install --no-cache-dir \
    "auto-gptq>=0.7.1" \
    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
 # Install the rest of the app requirements.
 # The manifest is copied on its own so this layer stays cached until requirements.txt changes.
 COPY requirements.txt .
 RUN python3.11 -m pip install --no-cache-dir -r requirements.txt
 COPY . .
 # Documentation only; the port is actually published by docker-compose / `docker run -p`.
 EXPOSE 8000
 CMD ["python3.11", "run.py"]
@@ -0,0 +1,18 @@
 # Compose definition: a single GPU-backed service plus a named volume for the model cache.
 services:
  voice-chat:
    # Build the image from the Dockerfile in this directory.
    build:
      context: .
    ports:
      - "8000:8000"
    volumes:
      # Keep downloaded models in a named volume so they survive container rebuilds
      - type: volume
        source: huggingface-cache
        target: /cache/huggingface
    # Reserve one NVIDIA GPU for the container.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities:
                - gpu
 volumes:
  huggingface-cache:
@@ -1,6 +1,11 @@
-torch>=2.5.0
+# torch (built from source) and auto-gptq (CUDA-specific wheel index) are installed in the Dockerfile, not from this file.
 # For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
 transformers==4.57.6
 optimum>=1.19
 compressed-tensors>=0.5.0
 silero-vad>=5.1
 qwen-asr==0.0.6
 kokoro==0.9.4
 fastapi>=0.115.0
 uvicorn[standard]>=0.30.0
 numpy
@@ -9,6 +9,20 @@ from server.tts import TTSEngine
 log = logging.getLogger(__name__)
 def get_device():
    """Pick the device string used for model loading.

    Returns:
        "cuda:0" when torch reports CUDA and a one-element tensor can be
        allocated on it; "cpu" otherwise.
    """
    if not torch.cuda.is_available():
        log.info("Using CPU device")
        return "cpu"
    try:
        # Allocating a tiny tensor checks that the GPU is usable, not just detected.
        torch.zeros(1, device="cuda:0")
        log.info("Using CUDA device")
        return "cuda:0"
    except RuntimeError as err:
        log.warning(f"CUDA available but error occurred: {err}. Falling back to CPU.")
    log.info("Using CPU device")
    return "cpu"
 class ModelManager:
    """Loads and holds all models. Initialized once at server startup."""
@@ -37,28 +51,30 @@ class ModelManager:
        log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
        from qwen_asr import Qwen3ASRModel
        device = get_device()
        asr_model = Qwen3ASRModel.from_pretrained(
            "Qwen/Qwen3-ASR-0.6B",
            dtype=torch.bfloat16,
-            device_map="cuda:0",
+            device_map=device,
            max_new_tokens=4096,
        )
        self.asr_engine = ASREngine(asr_model)
        log.info("Qwen3-ASR-0.6B loaded.")
    def _load_llm(self):
        """Load the chat LLM and its tokenizer and store them as ``self.llm_engine``.

        The model is placed on the device reported by ``get_device()``, and the
        log messages are derived from ``model_name`` so they match what is loaded.
        """
        model_name = "Qwen/Qwen3.5-0.8B"
        log.info("Loading %s...", model_name)
        # Imported lazily so transformers is only loaded when this method runs.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        device = get_device()
        # device_map is passed exactly once; repeating the keyword is a SyntaxError.
        # NOTE(review): the previous revision also passed torch_dtype=torch.bfloat16;
        # it is no longer set here — confirm the library's default dtype is intended.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map=device,
        )
        self.llm_engine = LLMEngine(model, tokenizer)
        log.info("%s loaded.", model_name)
    def _load_tts(self):
        log.info("Loading Kokoro TTS...")