updates for docker take 1

2026-04-08 03:05:26 -04:00
parent ce41bca422
commit 0305f1dccd
7 changed files with 108 additions and 7 deletions
@@ -0,0 +1,8 @@
+.venv/
+__pycache__/
+*.pyc
+*.pyo
+.git/
+.gitignore
+*.md
+.env
@@ -1,2 +1,3 @@
 .venv
+.claude
 __pycache__
@@ -0,0 +1,53 @@
+FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+# HuggingFace model cache — mounted as a volume so models persist across runs
+ENV HF_HOME=/cache/huggingface
+
+RUN apt-get update && apt-get install -y \
+    python3.11 \
+    python3.11-dev \
+    python3.11-venv \
+    git \
+    ffmpeg \
+    curl \
+    cmake \
+    ninja-build \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Bootstrap pip for python3.11 (Debian disables ensurepip for system Python)
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
+
+RUN ln -sf /usr/bin/python3.11 /usr/bin/python
+
+WORKDIR /app
+
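+# Build PyTorch from source with Blackwell (sm_120) support.
+# NOTE(review): the base image ships the CUDA 12.1 toolkit, which predates sm_120
+# (compiler support arrived in CUDA 12.8), so the "12.0" entry in TORCH_CUDA_ARCH_LIST
+# below will likely fail to compile — confirm, and bump the base image if so.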
+RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \
+    cd /tmp/pytorch && \
+    git submodule update --init --recursive && \
+    TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \
+    python3.11 setup.py install && \
+    cd / && rm -rf /tmp/pytorch
+
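+# Install torchvision and torchaudio with CUDA 12.1 support.
+# NOTE(review): these wheels depend on a pinned torch release, so pip will likely pull a
+# prebuilt cu121 torch from this index and replace the source build above — confirm.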
+RUN python3.11 -m pip install --no-cache-dir \
+    torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/cu121
+
+# Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source)
+RUN python3.11 -m pip install --no-cache-dir \
+    "auto-gptq>=0.7.1" \
+    --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/
+
+# Install the rest of the app requirements
+COPY requirements.txt .
+RUN python3.11 -m pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+EXPOSE 8000
+
+CMD ["python3.11", "run.py"]
@@ -0,0 +1,18 @@
+services:
+  voice-chat:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      # Cache models on the host so they survive container rebuilds
+      - huggingface-cache:/cache/huggingface
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: 1
+              capabilities: [gpu]
+
+volumes:
+  huggingface-cache:
@@ -1,6 +1,11 @@
-torch>=2.5.0
+# torch (built from source) and auto-gptq (CUDA 12.1 wheel index) are installed in the Dockerfile.
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121
 transformers==4.57.6
+optimum>=1.19
+compressed-tensors>=0.5.0
 silero-vad>=5.1
+qwen-asr==0.0.6
+kokoro==0.9.4
 fastapi>=0.115.0
 uvicorn[standard]>=0.30.0
 numpy
@@ -9,6 +9,20 @@ from server.tts import TTSEngine
 log = logging.getLogger(__name__)


+def get_device():
+    """Return the torch device string that models should be loaded on.
+
+    Returns:
+        "cuda:0" when CUDA is reported available and a small test
+        allocation on it completes without error; otherwise "cpu".
+    """
+    if torch.cuda.is_available():
+        try:
+            # is_available() only reports that a device is visible; allocate
+            # a tensor to prove it is usable, and synchronize so a failure
+            # that CUDA reports asynchronously is raised inside this try.
+            torch.zeros(1, device="cuda:0")
+            torch.cuda.synchronize(torch.device("cuda:0"))
+        except RuntimeError as e:
+            log.warning("CUDA available but error occurred: %s. Falling back to CPU.", e)
+        else:
+            log.info("Using CUDA device")
+            return "cuda:0"
+    log.info("Using CPU device")
+    return "cpu"
+
+
 class ModelManager:
    """Loads and holds all models. Initialized once at server startup."""

@@ -37,28 +51,30 @@ class ModelManager:
        log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
        from qwen_asr import Qwen3ASRModel

+        device = get_device()
        asr_model = Qwen3ASRModel.from_pretrained(
            "Qwen/Qwen3-ASR-0.6B",
            dtype=torch.bfloat16,
-            device_map="cuda:0",
+            device_map=device,
            max_new_tokens=4096,
        )
        self.asr_engine = ASREngine(asr_model)
        log.info("Qwen3-ASR-0.6B loaded.")

    def _load_llm(self):
+        """Load the chat LLM and its tokenizer and store them as ``self.llm_engine``.
+
+        The model is placed on the device returned by ``get_device()``.
+        Log messages are derived from ``model_name`` so they always name
+        the checkpoint that is actually loaded.
+        """
-        log.info("Loading Qwen3-0.6B-Instruct...")
+        model_name = "Qwen/Qwen3.5-0.8B"
+        log.info("Loading %s...", model_name)
        from transformers import AutoModelForCausalLM, AutoTokenizer

-        model_name = "Qwen/Qwen3-0.6B"
        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        device = get_device()
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
-            torch_dtype=torch.bfloat16,
-            device_map="cuda:0",
+            device_map=device,
        )
        self.llm_engine = LLMEngine(model, tokenizer)
-        log.info("Qwen3-0.6B-Instruct loaded.")
+        log.info("%s loaded.", model_name)

    def _load_tts(self):
        log.info("Loading Kokoro TTS...")