initial commit

2026-04-07 03:58:35 -04:00
commit ce41bca422
17 changed files with 1184 additions and 0 deletions
@@ -0,0 +1,2 @@
+.venv
+__pycache__
@@ -0,0 +1,9 @@
+torch>=2.5.0
+transformers==4.57.6
+silero-vad>=5.1
+fastapi>=0.115.0
+uvicorn[standard]>=0.30.0
+numpy
+soundfile
+scipy
+python-multipart
@@ -0,0 +1,10 @@
+import uvicorn
+
+if __name__ == "__main__":
+    uvicorn.run(
+        "server.main:app",
+        host="0.0.0.0",
+        port=8000,
+        reload=False,
+        log_level="info",
+    )
@@ -0,0 +1,25 @@
+import numpy as np
+
+
+class ASREngine:
+    """Wraps Qwen3-ASR for speech-to-text transcription."""
+
+    def __init__(self, model):
+        self.model = model
+
+    def transcribe(self, audio_16k: np.ndarray) -> str:
+        """Transcribe a complete utterance.
+
+        Args:
+            audio_16k: Float32 numpy array at 16kHz sample rate.
+
+        Returns:
+            Transcribed text string.
+        """
+        results = self.model.transcribe(
+            audio=(audio_16k, 16000),
+            language=None,  # auto-detect
+        )
+        if results and results[0].text:
+            return results[0].text.strip()
+        return ""
@@ -0,0 +1,63 @@
+import numpy as np
+from scipy.signal import resample_poly
+from math import gcd
+
+
+def pcm_bytes_to_float32(pcm_bytes: bytes, dtype=np.int16) -> np.ndarray:
+    """Convert raw PCM bytes (16-bit signed int) to float32 in [-1, 1]."""
+    audio = np.frombuffer(pcm_bytes, dtype=dtype)
+    return audio.astype(np.float32) / 32768.0
+
+
+def float32_to_pcm_bytes(audio) -> bytes:
+    """Convert float32 audio in [-1, 1] to 16-bit PCM bytes.
+
+    Accepts numpy arrays or PyTorch tensors.
+    """
+    if not isinstance(audio, np.ndarray):
+        audio = audio.detach().cpu().numpy()
+    clamped = np.clip(audio, -1.0, 1.0)
+    return (clamped * 32767).astype(np.int16).tobytes()
+
+
+def resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
+    """Resample audio from orig_sr to target_sr using polyphase filtering."""
+    if orig_sr == target_sr:
+        return audio
+    divisor = gcd(orig_sr, target_sr)
+    up = target_sr // divisor
+    down = orig_sr // divisor
+    return resample_poly(audio, up, down).astype(audio.dtype)
+
+
+def split_sentences(text: str) -> tuple[list[str], str]:
+    """Split text into completed sentences and a remaining buffer.
+
+    Returns (sentences, remaining_buffer).
+    Splits on sentence-ending punctuation followed by whitespace.
+    """
+    sentences = []
+    buffer = text
+    terminators = ".!?"
+
+    i = 0
+    start = 0
+    while i < len(buffer):
+        if buffer[i] in terminators:
+            # Look ahead for whitespace or end of string
+            end = i + 1
+            while end < len(buffer) and buffer[end] in terminators:
+                end += 1
+            if end >= len(buffer) or buffer[end] == " " or buffer[end] == "\n":
+                sentence = buffer[start:end].strip()
+                if sentence:
+                    sentences.append(sentence)
+                start = end
+                i = end
+            else:
+                i += 1
+        else:
+            i += 1
+
+    remaining = buffer[start:].strip()
+    return sentences, remaining
@@ -0,0 +1,83 @@
+import logging
+import threading
+from typing import AsyncIterator
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from server.audio_utils import split_sentences
+
+log = logging.getLogger(__name__)
+
+
+class LLMEngine:
+    """Wraps Qwen3 for conversation generation."""
+
+    SYSTEM_PROMPT = (
+        "You are a helpful voice assistant. Keep your responses concise and natural "
+        "for spoken conversation. Respond in 1-3 short sentences. "
+        "Do not use markdown, bullet points, code blocks, emojis, or any "
+        "formatting that doesn't work in speech."
+    )
+
+    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
+        self.model = model
+        self.tokenizer = tokenizer
+
+    def _build_inputs(self, messages: list[dict]):
+        """Build input token ids using the model's chat template."""
+        chat_messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]
+        for msg in messages:
+            chat_messages.append({"role": msg["role"], "content": msg["content"]})
+
+        text = self.tokenizer.apply_chat_template(
+            chat_messages,
+            tokenize=False,
+            add_generation_prompt=True,
+            enable_thinking=False,
+        )
+        return self.tokenizer(text, return_tensors="pt").to(self.model.device)
+
+    def generate(self, messages: list[dict], max_new_tokens: int = 256) -> str:
+        """Generate a complete response (blocking)."""
+        inputs = self._build_inputs(messages)
+        input_len = inputs["input_ids"].shape[1]
+
+        with torch.no_grad():
+            output_ids = self.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                temperature=0.7,
+                top_p=0.9,
+                do_sample=True,
+                repetition_penalty=1.2,
+            )
+
+        # Decode only the generated tokens (skip prompt)
+        new_ids = output_ids[0][input_len:]
+        response = self.tokenizer.decode(new_ids, skip_special_tokens=True).strip()
+        log.info(f"LLM response: {response}")
+        return response
+
+    async def generate_sentences(
+        self,
+        messages: list[dict],
+        cancel_event: threading.Event | None = None,
+    ) -> AsyncIterator[str]:
+        """Generate response and yield it sentence by sentence for TTS pipelining."""
+        import asyncio
+
+        response = await asyncio.to_thread(self.generate, messages)
+
+        if cancel_event and cancel_event.is_set():
+            return
+
+        # Split into sentences and yield each
+        sentences, remainder = split_sentences(response)
+        for sentence in sentences:
+            if cancel_event and cancel_event.is_set():
+                return
+            yield sentence
+
+        if remainder:
+            yield remainder
@@ -0,0 +1,87 @@
+import json
+import logging
+import os
+from contextlib import asynccontextmanager
+
+import numpy as np
+from fastapi import FastAPI, UploadFile, WebSocket, WebSocketDisconnect
+from fastapi.params import Form
+from fastapi.responses import FileResponse
+from fastapi.staticfiles import StaticFiles
+
+from server.audio_utils import pcm_bytes_to_float32
+from server.models import ModelManager
+from server.pipeline import ConversationSession
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s")
+log = logging.getLogger(__name__)
+
+REFERENCE_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "reference_audio")
+STATIC_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "static")
+
+model_mgr = ModelManager()
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    log.info("Starting model loading...")
+    model_mgr.load_all()
+    log.info("Server ready.")
+    yield
+    log.info("Shutting down.")
+
+
+app = FastAPI(lifespan=lifespan)
+app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")
+
+
+@app.get("/")
+async def index():
+    return FileResponse(os.path.join(STATIC_DIR, "index.html"))
+
+
+@app.post("/api/set-voice")
+async def set_voice(voice: str = Form(...), lang: str = Form("a")):
+    """Change the TTS voice."""
+    model_mgr.tts_engine.set_voice(voice, lang)
+    return {"status": "ok", "voice": voice}
+
+
+@app.websocket("/ws/chat")
+async def websocket_chat(ws: WebSocket):
+    await ws.accept()
+    log.info("WebSocket client connected.")
+
+    async def send_json(data: dict):
+        await ws.send_text(json.dumps(data))
+
+    async def send_bytes(data: bytes):
+        await ws.send_bytes(data)
+
+    session = ConversationSession(model_mgr, send_json, send_bytes)
+    await session.start()
+
+    try:
+        while True:
+            message = await ws.receive()
+
+            if "bytes" in message:
+                pcm_data = message["bytes"]
+                chunk = pcm_bytes_to_float32(pcm_data)
+                await session.handle_audio_chunk(chunk)
+
+            elif "text" in message:
+                try:
+                    msg = json.loads(message["text"])
+                except json.JSONDecodeError:
+                    continue
+
+                if msg.get("type") == "interrupt":
+                    await session.interrupt()
+
+    except WebSocketDisconnect:
+        log.info("WebSocket client disconnected.")
+    except Exception:
+        log.exception("WebSocket error")
+    finally:
+        await session.stop()
@@ -0,0 +1,70 @@
+import logging
+import torch
+
+from server.vad import StreamingVAD
+from server.asr import ASREngine
+from server.llm import LLMEngine
+from server.tts import TTSEngine
+
+log = logging.getLogger(__name__)
+
+
+class ModelManager:
+    """Loads and holds all models. Initialized once at server startup."""
+
+    def __init__(self):
+        self.vad_model = None
+        self.asr_engine: ASREngine | None = None
+        self.llm_engine: LLMEngine | None = None
+        self.tts_engine: TTSEngine | None = None
+
+    def load_all(self):
+        """Load all models sequentially. Call from the main process."""
+        self._load_vad()
+        self._load_asr()
+        self._load_llm()
+        self._load_tts()
+        log.info("All models loaded successfully.")
+
+    def _load_vad(self):
+        log.info("Loading Silero VAD...")
+        from silero_vad import load_silero_vad
+
+        self.vad_model = load_silero_vad()
+        log.info("Silero VAD loaded (CPU).")
+
+    def _load_asr(self):
+        log.info("Loading Qwen3-ASR-0.6B (transformers backend)...")
+        from qwen_asr import Qwen3ASRModel
+
+        asr_model = Qwen3ASRModel.from_pretrained(
+            "Qwen/Qwen3-ASR-0.6B",
+            dtype=torch.bfloat16,
+            device_map="cuda:0",
+            max_new_tokens=4096,
+        )
+        self.asr_engine = ASREngine(asr_model)
+        log.info("Qwen3-ASR-0.6B loaded.")
+
+    def _load_llm(self):
+        log.info("Loading Qwen3-0.6B-Instruct...")
+        from transformers import AutoModelForCausalLM, AutoTokenizer
+
+        model_name = "Qwen/Qwen3-0.6B"
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda:0",
+        )
+        self.llm_engine = LLMEngine(model, tokenizer)
+        log.info("Qwen3-0.6B-Instruct loaded.")
+
+    def _load_tts(self):
+        log.info("Loading Kokoro TTS...")
+        self.tts_engine = TTSEngine()
+        log.info("Kokoro TTS loaded.")
+
+    def create_vad(self) -> StreamingVAD:
+        """Create a new StreamingVAD instance for a client session."""
+        return StreamingVAD(self.vad_model)
@@ -0,0 +1,164 @@
+import asyncio
+import logging
+import queue
+import threading
+
+import numpy as np
+
+from server.audio_utils import float32_to_pcm_bytes
+from server.models import ModelManager
+from server.vad import StreamingVAD
+
+log = logging.getLogger(__name__)
+
+_SENTINEL = None
+
+
+class ConversationSession:
+    """Manages a single client's voice conversation pipeline.
+
+    Orchestrates: VAD -> ASR -> LLM -> TTS streaming with barge-in support.
+    """
+
+    def __init__(self, models: ModelManager, send_json, send_bytes):
+        self.models = models
+        self.send_json = send_json
+        self.send_bytes = send_bytes
+
+        self.vad: StreamingVAD = models.create_vad()
+        self.conversation_history: list[dict] = []
+        self.cancel_event = threading.Event()
+        self.is_responding = False
+        self._response_task: asyncio.Task | None = None
+
+    async def start(self):
+        await self.send_json({"type": "status", "state": "listening"})
+
+    async def stop(self):
+        self.cancel_event.set()
+        if self._response_task and not self._response_task.done():
+            self._response_task.cancel()
+
+    async def handle_audio_chunk(self, chunk_16k: np.ndarray):
+        utterance = self.vad.process_chunk(chunk_16k)
+
+        if utterance is not None:
+            if self.is_responding:
+                await self._interrupt()
+            # Launch response pipeline as a background task so we don't block receives
+            self._response_task = asyncio.create_task(self._process_utterance(utterance))
+        elif self.vad.is_speaking and self.is_responding:
+            await self._interrupt()
+
+    async def interrupt(self):
+        """Public interrupt method for WebSocket text messages."""
+        if self.is_responding:
+            await self._interrupt()
+
+    async def _interrupt(self):
+        log.info("Barge-in: cancelling response.")
+        self.cancel_event.set()
+        self.is_responding = False
+        # Tell client to stop audio immediately
+        try:
+            await self.send_json({"type": "interrupt"})
+        except Exception:
+            pass
+
+    async def _process_utterance(self, audio_16k: np.ndarray):
+        """Full pipeline: ASR -> LLM -> TTS streaming."""
+        self.is_responding = True
+        self.cancel_event.clear()
+
+        # ASR
+        await self.send_json({"type": "status", "state": "thinking"})
+        text = await asyncio.to_thread(self.models.asr_engine.transcribe, audio_16k)
+
+        if not text:
+            log.info("ASR returned empty text, resuming listening.")
+            self.is_responding = False
+            await self.send_json({"type": "status", "state": "listening"})
+            return
+
+        await self.send_json({"type": "transcript", "text": text, "final": True})
+        log.info(f"User: {text}")
+        self.conversation_history.append({"role": "user", "content": text})
+
+        if self.cancel_event.is_set():
+            self.is_responding = False
+            return
+
+        # LLM
+        log.info(f"Conversation history ({len(self.conversation_history)} messages): "
+                 + str([m['content'][:50] for m in self.conversation_history]))
+        response = await asyncio.to_thread(
+            self.models.llm_engine.generate, self.conversation_history
+        )
+
+        if self.cancel_event.is_set():
+            self.is_responding = False
+            return
+
+        # TTS - stream chunks with per-sentence text
+        await self.send_json({"type": "status", "state": "speaking"})
+        chunk_queue = queue.Queue()
+
+        def _tts_worker():
+            try:
+                for graphemes, _ps, audio in self.models.tts_engine.pipeline(
+                    response, voice=self.models.tts_engine.voice
+                ):
+                    if self.cancel_event.is_set():
+                        break
+                    if audio is not None and len(audio) > 0:
+                        chunk_queue.put((graphemes, audio))
+            except Exception:
+                log.exception("TTS generation error")
+            finally:
+                chunk_queue.put(_SENTINEL)
+
+        tts_thread = threading.Thread(target=_tts_worker, daemon=True)
+        tts_thread.start()
+
+        spoken_text = ""
+        while True:
+            try:
+                item = await asyncio.to_thread(chunk_queue.get, timeout=10.0)
+            except Exception:
+                break
+
+            if item is _SENTINEL:
+                break
+            if self.cancel_event.is_set():
+                break
+
+            sentence_text, audio = item
+            spoken_text += sentence_text
+
+            await self.send_json({"type": "response_text", "text": sentence_text, "final": False})
+            pcm_bytes = float32_to_pcm_bytes(audio)
+            try:
+                await self.send_bytes(pcm_bytes)
+            except Exception:
+                log.warning("Failed to send audio, client disconnected.")
+                self.cancel_event.set()
+                break
+
+        tts_thread.join(timeout=2.0)
+
+        # Save only what was actually spoken
+        if spoken_text.strip():
+            self.conversation_history.append(
+                {"role": "assistant", "content": spoken_text.strip()}
+            )
+        elif self.conversation_history and self.conversation_history[-1]["role"] == "user":
+            self.conversation_history.pop()
+
+        if not self.cancel_event.is_set():
+            await self.send_json({"type": "response_text", "text": "", "final": True})
+
+        self.is_responding = False
+        try:
+            await self.send_json({"type": "status", "state": "listening"})
+        except Exception:
+            pass
@@ -0,0 +1,38 @@
+import logging
+from typing import Iterator
+
+import numpy as np
+
+log = logging.getLogger(__name__)
+
+DEFAULT_VOICE = "af_heart"
+DEFAULT_LANG = "a"  # American English
+
+
+class TTSEngine:
+    """Wraps Kokoro TTS for fast streaming text-to-speech."""
+
+    def __init__(self):
+        from kokoro import KPipeline
+
+        self.pipeline = KPipeline(lang_code=DEFAULT_LANG)
+        self.voice = DEFAULT_VOICE
+        self.sample_rate = 24000
+
+    def set_voice(self, voice: str, lang_code: str = "a"):
+        """Change the voice."""
+        from kokoro import KPipeline
+
+        self.voice = voice
+        self.pipeline = KPipeline(lang_code=lang_code)
+        log.info(f"Voice set to: {voice} (lang: {lang_code})")
+
+    def synthesize_stream(self, text: str) -> Iterator[np.ndarray]:
+        """Yield audio chunks as they are generated.
+
+        Each chunk is a float32 numpy array at self.sample_rate (24kHz).
+        Kokoro internally splits text into sentences and yields per-sentence audio.
+        """
+        for _gs, _ps, audio in self.pipeline(text, voice=self.voice):
+            if audio is not None and len(audio) > 0:
+                yield audio
@@ -0,0 +1,52 @@
+import numpy as np
+import torch
+
+
+class StreamingVAD:
+    """Wraps Silero VAD for streaming chunk-by-chunk speech detection."""
+
+    def __init__(self, model, threshold: float = 0.5, min_silence_ms: int = 400):
+        from silero_vad import VADIterator
+
+        self.iterator = VADIterator(
+            model,
+            sampling_rate=16000,
+            threshold=threshold,
+            min_silence_duration_ms=min_silence_ms,
+        )
+        self.audio_buffer: list[np.ndarray] = []
+        self.is_speaking = False
+
+    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
+        """Feed a 512-sample chunk at 16kHz.
+
+        Returns the complete utterance as a numpy array when speech ends,
+        or None if still accumulating.
+        """
+        tensor = torch.from_numpy(chunk_16k).float()
+        speech_dict = self.iterator(tensor, return_seconds=False)
+
+        if speech_dict:
+            if "start" in speech_dict:
+                self.is_speaking = True
+                self.audio_buffer = []
+            if "end" in speech_dict:
+                self.is_speaking = False
+                if self.audio_buffer:
+                    result = np.concatenate(self.audio_buffer)
+                    self.audio_buffer = []
+                    self.iterator.reset_states()
+                    return result
+                self.iterator.reset_states()
+                return None
+
+        if self.is_speaking:
+            self.audio_buffer.append(chunk_16k.copy())
+
+        return None
+
+    def reset(self):
+        """Reset VAD state for a new conversation turn."""
+        self.audio_buffer = []
+        self.is_speaking = False
+        self.iterator.reset_states()
@@ -0,0 +1,305 @@
+// --- State ---
+let ws = null;
+let audioCtx = null;
+let micStream = null;
+let workletNode = null;
+let micActive = false;
+let nextPlayTime = 0;
+let isPlaying = false;
+
+const PLAYBACK_SR = 24000; // TTS output sample rate
+const MIC_SR = 16000;
+const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
+const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
+let bargeInCount = 0;
+
+const chatArea = document.getElementById("chat-area");
+const statusBadge = document.getElementById("status-badge");
+const micBtn = document.getElementById("mic-btn");
+
+// --- WebSocket ---
+
+function connectWS() {
+  const proto = location.protocol === "https:" ? "wss:" : "ws:";
+  ws = new WebSocket(`${proto}//${location.host}/ws/chat`);
+  ws.binaryType = "arraybuffer";
+
+  ws.onopen = () => {
+    setStatus("listening");
+  };
+
+  ws.onclose = () => {
+    setStatus("disconnected");
+    setTimeout(connectWS, 2000);
+  };
+
+  ws.onerror = () => {
+    ws.close();
+  };
+
+  ws.onmessage = (event) => {
+    if (event.data instanceof ArrayBuffer) {
+      playAudioChunk(event.data);
+    } else {
+      handleJSON(JSON.parse(event.data));
+    }
+  };
+}
+
+function handleJSON(msg) {
+  switch (msg.type) {
+    case "status":
+      setStatus(msg.state);
+      break;
+
+    case "interrupt":
+      stopPlayback();
+      // Trim the assistant message to what was spoken, then finalize
+      finalizeAssistantMessage();
+      break;
+
+    case "transcript":
+      addMessage("user", msg.text);
+      break;
+
+    case "response_text":
+      if (msg.final) {
+        finalizeAssistantMessage();
+      } else {
+        appendAssistantText(msg.text);
+      }
+      break;
+  }
+}
+
+// --- Status ---
+
+function setStatus(state) {
+  statusBadge.textContent =
+    state === "listening"
+      ? "Listening"
+      : state === "thinking"
+      ? "Thinking..."
+      : state === "speaking"
+      ? "Speaking"
+      : state === "disconnected"
+      ? "Disconnected"
+      : state;
+  statusBadge.className = state;
+}
+
+// --- Chat Messages ---
+
+let currentAssistantEl = null;
+let currentAssistantText = "";
+
+function addMessage(role, text) {
+  const el = document.createElement("div");
+  el.className = `message ${role}`;
+  el.textContent = text;
+  chatArea.appendChild(el);
+  chatArea.scrollTop = chatArea.scrollHeight;
+}
+
+function appendAssistantText(text) {
+  if (!currentAssistantEl) {
+    currentAssistantEl = document.createElement("div");
+    currentAssistantEl.className = "message assistant";
+    chatArea.appendChild(currentAssistantEl);
+    currentAssistantText = "";
+  }
+  currentAssistantText += (currentAssistantText ? " " : "") + text;
+  currentAssistantEl.textContent = currentAssistantText;
+  chatArea.scrollTop = chatArea.scrollHeight;
+}
+
+function finalizeAssistantMessage() {
+  currentAssistantEl = null;
+  currentAssistantText = "";
+}
+
+// --- Audio Playback ---
+
+let activeSources = [];
+
+function getPlaybackCtx() {
+  if (!audioCtx || audioCtx.state === "closed") {
+    audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
+  }
+  return audioCtx;
+}
+
+function playAudioChunk(arrayBuffer) {
+  const ctx = getPlaybackCtx();
+  const int16 = new Int16Array(arrayBuffer);
+  const float32 = new Float32Array(int16.length);
+  for (let i = 0; i < int16.length; i++) {
+    float32[i] = int16[i] / 32768;
+  }
+
+  const buffer = ctx.createBuffer(1, float32.length, PLAYBACK_SR);
+  buffer.getChannelData(0).set(float32);
+
+  const source = ctx.createBufferSource();
+  source.buffer = buffer;
+  source.connect(ctx.destination);
+
+  activeSources.push(source);
+  isPlaying = true;
+  source.onended = () => {
+    activeSources = activeSources.filter((s) => s !== source);
+    if (activeSources.length === 0) {
+      isPlaying = false;
+      bargeInCount = 0;
+    }
+  };
+
+  const now = ctx.currentTime;
+  if (nextPlayTime < now) {
+    nextPlayTime = now + 0.01;
+  }
+  source.start(nextPlayTime);
+  nextPlayTime += buffer.duration;
+}
+
+function stopPlayback() {
+  for (const source of activeSources) {
+    try {
+      source.stop();
+    } catch (_) {}
+  }
+  activeSources = [];
+  nextPlayTime = 0;
+  isPlaying = false;
+  bargeInCount = 0;
+}
+
+// --- Microphone ---
+
+async function toggleMic() {
+  if (micActive) {
+    stopMic();
+  } else {
+    await startMic();
+  }
+}
+
+async function startMic() {
+  try {
+    // Ensure playback context exists (needed for user gesture)
+    getPlaybackCtx();
+    if (audioCtx.state === "suspended") {
+      await audioCtx.resume();
+    }
+
+    micStream = await navigator.mediaDevices.getUserMedia({
+      audio: {
+        sampleRate: MIC_SR,
+        channelCount: 1,
+        echoCancellation: true,
+        noiseSuppression: true,
+        autoGainControl: true,
+      },
+    });
+
+    // Create a separate context at 16kHz for mic capture
+    const micCtx = new AudioContext({ sampleRate: MIC_SR });
+    const source = micCtx.createMediaStreamSource(micStream);
+
+    await micCtx.audioWorklet.addModule("/static/processor.js");
+    workletNode = new AudioWorkletNode(micCtx, "pcm-processor");
+    source.connect(workletNode);
+
+    workletNode.port.onmessage = (e) => {
+      if (ws && ws.readyState === WebSocket.OPEN) {
+        ws.send(e.data);
+
+        // Client-side barge-in: detect mic energy while playing
+        if (isPlaying) {
+          const samples = new Int16Array(e.data);
+          let sum = 0;
+          for (let i = 0; i < samples.length; i++) {
+            const s = samples[i] / 32768;
+            sum += s * s;
+          }
+          const rms = Math.sqrt(sum / samples.length);
+
+          if (rms > BARGE_IN_THRESHOLD) {
+            bargeInCount++;
+            if (bargeInCount >= BARGE_IN_FRAMES) {
+              // User is speaking over the assistant - interrupt
+              stopPlayback();
+              finalizeAssistantMessage();
+              ws.send(JSON.stringify({ type: "interrupt" }));
+              isPlaying = false;
+              bargeInCount = 0;
+            }
+          } else {
+            bargeInCount = 0;
+          }
+        }
+      }
+    };
+
+    // Store for cleanup
+    workletNode._micCtx = micCtx;
+
+    micActive = true;
+    micBtn.classList.add("active");
+
+    // Connect WebSocket if not already
+    if (!ws || ws.readyState !== WebSocket.OPEN) {
+      connectWS();
+    }
+  } catch (err) {
+    console.error("Mic access failed:", err);
+    alert("Could not access microphone. Please allow mic permissions.");
+  }
+}
+
+function stopMic() {
+  if (workletNode) {
+    workletNode.disconnect();
+    if (workletNode._micCtx) {
+      workletNode._micCtx.close();
+    }
+    workletNode = null;
+  }
+  if (micStream) {
+    micStream.getTracks().forEach((t) => t.stop());
+    micStream = null;
+  }
+  micActive = false;
+  micBtn.classList.remove("active");
+}
+
+// --- Voice Selection ---
+
+async function applyVoice() {
+  const voice = document.getElementById("voice-select").value;
+  const statusEl = document.getElementById("voice-status");
+
+  const formData = new FormData();
+  formData.append("voice", voice);
+  formData.append("lang", "a");
+
+  statusEl.textContent = "Applying...";
+  try {
+    const resp = await fetch("/api/set-voice", {
+      method: "POST",
+      body: formData,
+    });
+    const data = await resp.json();
+    if (data.status === "ok") {
+      statusEl.textContent = "Voice: " + voice;
+    } else {
+      statusEl.textContent = "Failed.";
+    }
+  } catch (err) {
+    statusEl.textContent = "Error: " + err.message;
+  }
+}
+
+// Expose to HTML onclick
+window.toggleMic = toggleMic;
+window.applyVoice = applyVoice;
@@ -0,0 +1,49 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+  <meta charset="UTF-8" />
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  <title>Voice Chat</title>
+  <link rel="stylesheet" href="/static/style.css" />
+</head>
+<body>
+  <header>
+    <h1>Voice Chat</h1>
+    <span id="status-badge">Disconnected</span>
+  </header>
+
+  <div id="chat-area"></div>
+
+  <details id="voice-panel">
+    <summary>Voice Settings</summary>
+    <div class="panel-content">
+      <label>
+        Voice
+        <select id="voice-select">
+          <optgroup label="Female">
+            <option value="af_heart" selected>Heart</option>
+            <option value="af_nicole">Nicole</option>
+            <option value="af_bella">Bella</option>
+            <option value="af_sarah">Sarah</option>
+            <option value="af_nova">Nova</option>
+            <option value="af_jessica">Jessica</option>
+            <option value="af_river">River</option>
+          </optgroup>
+          <optgroup label="Male">
+            <option value="am_adam">Adam</option>
+            <option value="am_michael">Michael</option>
+          </optgroup>
+        </select>
+      </label>
+      <button id="apply-voice-btn" onclick="applyVoice()">Apply</button>
+      <span id="voice-status"></span>
+    </div>
+  </details>
+
+  <div id="controls">
+    <button id="mic-btn" onclick="toggleMic()">&#x1F3A4;</button>
+  </div>
+
+  <script src="/static/app.js"></script>
+</body>
+</html>
@@ -0,0 +1,42 @@
+/**
+ * AudioWorkletProcessor that collects 512-sample chunks of PCM audio
+ * and posts them to the main thread for WebSocket transmission.
+ */
+class PCMProcessor extends AudioWorkletProcessor {
+  constructor() {
+    super();
+    this.buffer = new Float32Array(0);
+    this.chunkSize = 512; // 512 samples at 16kHz = 32ms
+  }
+
+  process(inputs) {
+    const input = inputs[0];
+    if (!input || !input[0]) return true;
+
+    const channelData = input[0]; // mono
+
+    // Append to buffer
+    const newBuffer = new Float32Array(this.buffer.length + channelData.length);
+    newBuffer.set(this.buffer);
+    newBuffer.set(channelData, this.buffer.length);
+    this.buffer = newBuffer;
+
+    // Send complete chunks
+    while (this.buffer.length >= this.chunkSize) {
+      const chunk = this.buffer.slice(0, this.chunkSize);
+      this.buffer = this.buffer.slice(this.chunkSize);
+
+      // Convert float32 to int16 for transmission
+      const int16 = new Int16Array(chunk.length);
+      for (let i = 0; i < chunk.length; i++) {
+        const s = Math.max(-1, Math.min(1, chunk[i]));
+        int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
+      }
+      this.port.postMessage(int16.buffer, [int16.buffer]);
+    }
+
+    return true;
+  }
+}
+
+registerProcessor("pcm-processor", PCMProcessor);
@@ -0,0 +1,185 @@
+* {
+  margin: 0;
+  padding: 0;
+  box-sizing: border-box;
+}
+
+body {
+  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
+  background: #0f0f0f;
+  color: #e0e0e0;
+  height: 100vh;
+  display: flex;
+  flex-direction: column;
+}
+
+header {
+  padding: 16px 24px;
+  border-bottom: 1px solid #222;
+  display: flex;
+  align-items: center;
+  justify-content: space-between;
+}
+
+header h1 {
+  font-size: 18px;
+  font-weight: 600;
+  color: #fff;
+}
+
+#status-badge {
+  padding: 4px 12px;
+  border-radius: 12px;
+  font-size: 13px;
+  font-weight: 500;
+  background: #1a1a2e;
+  color: #888;
+  transition: all 0.3s;
+}
+
+#status-badge.listening {
+  background: #0a2a1a;
+  color: #4ade80;
+}
+
+#status-badge.thinking {
+  background: #2a1a0a;
+  color: #fbbf24;
+}
+
+#status-badge.speaking {
+  background: #1a0a2a;
+  color: #a78bfa;
+}
+
+#chat-area {
+  flex: 1;
+  overflow-y: auto;
+  padding: 24px;
+  display: flex;
+  flex-direction: column;
+  gap: 12px;
+}
+
+.message {
+  max-width: 70%;
+  padding: 10px 16px;
+  border-radius: 16px;
+  font-size: 15px;
+  line-height: 1.5;
+  word-wrap: break-word;
+}
+
+.message.user {
+  align-self: flex-end;
+  background: #1d4ed8;
+  color: #fff;
+  border-bottom-right-radius: 4px;
+}
+
+.message.assistant {
+  align-self: flex-start;
+  background: #1e1e1e;
+  color: #e0e0e0;
+  border-bottom-left-radius: 4px;
+}
+
+#controls {
+  padding: 16px 24px;
+  border-top: 1px solid #222;
+  display: flex;
+  align-items: center;
+  gap: 16px;
+}
+
+#mic-btn {
+  width: 56px;
+  height: 56px;
+  border-radius: 50%;
+  border: 2px solid #333;
+  background: #1a1a1a;
+  color: #e0e0e0;
+  font-size: 24px;
+  cursor: pointer;
+  transition: all 0.2s;
+  display: flex;
+  align-items: center;
+  justify-content: center;
+}
+
+#mic-btn:hover {
+  border-color: #555;
+  background: #222;
+}
+
+#mic-btn.active {
+  border-color: #ef4444;
+  background: #2a0a0a;
+  color: #ef4444;
+  animation: pulse 1.5s infinite;
+}
+
+@keyframes pulse {
+  0%, 100% { box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.3); }
+  50% { box-shadow: 0 0 0 12px rgba(239, 68, 68, 0); }
+}
+
+/* Voice clone panel */
+#voice-panel {
+  padding: 12px 24px;
+  border-top: 1px solid #222;
+  background: #0a0a0a;
+}
+
+#voice-panel summary {
+  cursor: pointer;
+  font-size: 13px;
+  color: #888;
+  user-select: none;
+}
+
+#voice-panel .panel-content {
+  margin-top: 12px;
+  display: flex;
+  gap: 12px;
+  align-items: flex-end;
+  flex-wrap: wrap;
+}
+
+#voice-panel label {
+  font-size: 13px;
+  color: #aaa;
+  display: flex;
+  flex-direction: column;
+  gap: 4px;
+}
+
+#voice-panel input[type="file"],
+#voice-panel input[type="text"] {
+  background: #1a1a1a;
+  border: 1px solid #333;
+  border-radius: 6px;
+  padding: 6px 10px;
+  color: #e0e0e0;
+  font-size: 13px;
+}
+
+#upload-btn {
+  padding: 6px 16px;
+  border-radius: 6px;
+  border: 1px solid #333;
+  background: #1a1a1a;
+  color: #e0e0e0;
+  font-size: 13px;
+  cursor: pointer;
+}
+
+#upload-btn:hover {
+  background: #222;
+}
+
+#upload-status {
+  font-size: 12px;
+  color: #888;
+  margin-left: 8px;
+}