From 0305f1dccd18eac4b1df01d80bb888c72c51521f Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 8 Apr 2026 03:05:26 -0400 Subject: [PATCH] updates for docker take 1 --- .dockerignore | 8 +++++++ .gitignore | 1 + .gitmodules | 0 Dockerfile | 53 ++++++++++++++++++++++++++++++++++++++++++++++ docker-compose.yml | 18 ++++++++++++++++ requirements.txt | 7 +++++- server/models.py | 28 ++++++++++++++++++------ 7 files changed, 108 insertions(+), 7 deletions(-) create mode 100644 .dockerignore create mode 100644 .gitmodules create mode 100644 Dockerfile create mode 100644 docker-compose.yml diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7c2c8f0 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +.venv/ +__pycache__/ +*.pyc +*.pyo +.git/ +.gitignore +*.md +.env diff --git a/.gitignore b/.gitignore index 0e5ac79..aa47104 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ .venv +.claude __pycache__ \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..e69de29 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..049b53f --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV PYTHONUNBUFFERED=1 +# HuggingFace model cache — mounted as a volume so models persist across runs +ENV HF_HOME=/cache/huggingface + +RUN apt-get update && apt-get install -y \ + python3.11 \ + python3.11-dev \ + python3.11-venv \ + git \ + ffmpeg \ + curl \ + cmake \ + ninja-build \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + +# Bootstrap pip for python3.11 (Debian disables ensurepip for system Python) +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 + +RUN ln -sf /usr/bin/python3.11 /usr/bin/python + +WORKDIR /app + +# Build PyTorch from source with Blackwell (sm_120) support +RUN git clone --depth 1 https://github.com/pytorch/pytorch.git /tmp/pytorch && \ + cd /tmp/pytorch && \ + git submodule 
update --init --recursive && \ + TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;9.0a;12.0" \ + python3.11 setup.py install && \ + cd / && rm -rf /tmp/pytorch + +# Install torchvision and torchaudio with CUDA 12.1 support +RUN python3.11 -m pip install --no-cache-dir \ + torchvision torchaudio \ + --index-url https://download.pytorch.org/whl/cu121 + +# Install auto-gptq pre-built wheel for CUDA 12.1 (avoids compiling from source) +RUN python3.11 -m pip install --no-cache-dir \ + "auto-gptq>=0.7.1" \ + --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu121/ + +# Install the rest of the app requirements +COPY requirements.txt . +RUN python3.11 -m pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 8000 + +CMD ["python3.11", "run.py"] diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..14b9710 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,18 @@ +services: + voice-chat: + build: . + ports: + - "8000:8000" + volumes: + # Cache models on the host so they survive container rebuilds + - huggingface-cache:/cache/huggingface + deploy: + resources: + reservations: + devices: + - driver: nvidia + count: 1 + capabilities: [gpu] + +volumes: + huggingface-cache: diff --git a/requirements.txt b/requirements.txt index bc1e861..8c90877 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,11 @@ -torch>=2.5.0 +# torch and auto-gptq are installed in the Dockerfile with GPU-specific index URLs. 
+# For local dev outside Docker: pip install torch --index-url https://download.pytorch.org/whl/cu121 transformers==4.57.6 +optimum>=1.19 +compressed-tensors>=0.5.0 silero-vad>=5.1 +qwen-asr==0.0.6 +kokoro==0.9.4 fastapi>=0.115.0 uvicorn[standard]>=0.30.0 numpy diff --git a/server/models.py b/server/models.py index e02d521..35e059a 100644 --- a/server/models.py +++ b/server/models.py @@ -9,6 +9,20 @@ from server.tts import TTSEngine log = logging.getLogger(__name__) +def get_device(): + """Get the best available device (CUDA if available and working, otherwise CPU).""" + if torch.cuda.is_available(): + try: + # Test CUDA availability + torch.zeros(1, device="cuda:0") + log.info("Using CUDA device") + return "cuda:0" + except RuntimeError as e: + log.warning("CUDA available but error occurred: %s. Falling back to CPU.", e) + log.info("Using CPU device") + return "cpu" + + class ModelManager: """Loads and holds all models. Initialized once at server startup.""" @@ -37,28 +51,30 @@ class ModelManager: log.info("Loading Qwen3-ASR-0.6B (transformers backend)...") from qwen_asr import Qwen3ASRModel + device = get_device() asr_model = Qwen3ASRModel.from_pretrained( "Qwen/Qwen3-ASR-0.6B", dtype=torch.bfloat16, - device_map="cuda:0", + device_map=device, max_new_tokens=4096, ) self.asr_engine = ASREngine(asr_model) log.info("Qwen3-ASR-0.6B loaded.") def _load_llm(self): - log.info("Loading Qwen3-0.6B-Instruct...") + log.info("Loading Qwen3.5-0.8B...") from transformers import AutoModelForCausalLM, AutoTokenizer - model_name = "Qwen/Qwen3-0.6B" + model_name = "Qwen/Qwen3.5-0.8B" + tokenizer = AutoTokenizer.from_pretrained(model_name) + device = get_device() model = AutoModelForCausalLM.from_pretrained( model_name, - torch_dtype=torch.bfloat16, - device_map="cuda:0", + device_map=device, ) self.llm_engine = LLMEngine(model, tokenizer) - log.info("Qwen3-0.6B-Instruct loaded.") + log.info("%s loaded.", model_name) def _load_tts(self): log.info("Loading 
Kokoro TTS...")