// --- State ---
let ws = null; // Current WebSocket; replaced by connectWS() on (re)connect
let reconnectTimer = null; // ID of the pending reconnect timeout, or null
let audioCtx = null; // Playback AudioContext, created lazily by getPlaybackCtx()
let micStream = null; // MediaStream returned by getUserMedia while the mic is on
let workletNode = null; // AudioWorkletNode that delivers captured mic frames
let micActive = false;
let nextPlayTime = 0; // Playback-context time at which the next chunk is scheduled
let isPlaying = false;

const PLAYBACK_SR = 24000; // TTS output sample rate
const MIC_SR = 16000;
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
let bargeInCount = 0;

// --- Text-audio sync state ---
let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to arrive
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user

const chatArea = document.getElementById("chat-area");
const statusBadge = document.getElementById("status-badge");
const micBtn = document.getElementById("mic-btn");

// --- WebSocket ---

/**
 * Open the chat WebSocket and install its handlers.
 *
 * Idempotent: does nothing when the current socket is already CONNECTING or
 * OPEN, and cancels any pending reconnect timer, so repeated calls (from the
 * reconnect timer and from startMic) never produce parallel connections.
 * A closed socket schedules a reconnect after 2 seconds.
 */
function connectWS() {
  if (reconnectTimer !== null) {
    clearTimeout(reconnectTimer);
    reconnectTimer = null;
  }
  if (
    ws &&
    (ws.readyState === WebSocket.CONNECTING || ws.readyState === WebSocket.OPEN)
  ) {
    return;
  }

  const proto = location.protocol === "https:" ? "wss:" : "ws:";
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  // Every handler checks that its socket is still the current one, so a
  // late event from a superseded connection cannot disturb the new one.
  socket.onopen = () => {
    if (ws !== socket) return;
    setStatus("listening");
  };

  socket.onclose = () => {
    if (ws !== socket) return;
    setStatus("disconnected");
    reconnectTimer = setTimeout(connectWS, 2000);
  };

  // Close the socket that reported the error (not whatever `ws` points at now).
  socket.onerror = () => {
    socket.close();
  };

  socket.onmessage = (event) => {
    if (ws !== socket) return;
    if (event.data instanceof ArrayBuffer) {
      playAudioChunk(event.data);
      return;
    }
    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      console.error("Ignoring malformed server message:", err);
      return;
    }
    handleJSON(msg);
  };
}

/**
 * Dispatch a decoded JSON control message.
 *
 * Handles the message types "status", "interrupt", "transcript" and
 * "response_text"; any other type, or a non-object value, is ignored.
 *
 * @param {object} msg - Parsed message with a `type` field.
 */
function handleJSON(msg) {
  if (!msg || typeof msg !== "object") return;

  switch (msg.type) {
    case "status":
      setStatus(msg.state);
      break;

    case "interrupt":
      stopPlayback();
      // Finalize with interrupted marker — text already reflects only what was heard
      finalizeAssistantMessage(true);
      break;

    case "transcript":
      addMessage("user", msg.text);
      break;

    case "response_text":
      if (msg.final) {
        // Finalize right away only when nothing is queued AND no audio is
        // still playing. Otherwise the `onended` handler installed by
        // playAudioChunk finalizes after the last chunk; finalizing early
        // would reset lastDisplayedChunkId while the user can still barge in.
        if (
          !isPlaying &&
          pendingTextChunks.length === 0 &&
          scheduledTextTimers.length === 0
        ) {
          finalizeAssistantMessage(false);
        }
      } else {
        // Queue text — it will be displayed when corresponding audio starts playing
        pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
      }
      break;
  }
}

// --- Status ---

// Human-readable labels for the known connection/assistant states.
const STATUS_LABELS = new Map([
  ["listening", "Listening"],
  ["thinking", "Thinking..."],
  ["speaking", "Speaking"],
  ["disconnected", "Disconnected"],
]);

/**
 * Show `state` in the status badge and use it as the badge's CSS class.
 * Unknown states are displayed verbatim.
 *
 * @param {string} state - State name.
 */
function setStatus(state) {
  statusBadge.textContent = STATUS_LABELS.has(state)
    ? STATUS_LABELS.get(state)
    : state;
  statusBadge.className = state;
}

// --- Chat Messages ---
let currentAssistantEl = null; // Bubble currently receiving assistant text
let currentAssistantText = ""; // Text accumulated in that bubble so far

/**
 * Append a complete message bubble to the chat area and scroll to the bottom.
 *
 * @param {string} role - Used as a CSS class next to "message".
 * @param {string} text - Message text, inserted as textContent.
 */
function addMessage(role, text) {
  const el = document.createElement("div");
  el.className = `message ${role}`;
  el.textContent = text;
  chatArea.appendChild(el);
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Append a text chunk to the in-progress assistant bubble, creating the
 * bubble on first use. Chunks are joined with a single space.
 *
 * @param {string} text - Chunk text to append.
 */
function appendAssistantText(text) {
  if (!currentAssistantEl) {
    currentAssistantEl = document.createElement("div");
    currentAssistantEl.className = "message assistant";
    chatArea.appendChild(currentAssistantEl);
    currentAssistantText = "";
  }
  currentAssistantText += (currentAssistantText ? " " : "") + text;
  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Drop all queued text and cancel every scheduled text-display timer.
 */
function discardQueuedText() {
  for (const tid of scheduledTextTimers) clearTimeout(tid);
  scheduledTextTimers = [];
  pendingTextChunks = [];
}

/**
 * Close the in-progress assistant bubble and reset the text-audio sync state.
 *
 * @param {boolean} [interrupted=false] - When true and the bubble already
 *   shows some text, an " [interrupted]" marker is appended to it.
 */
function finalizeAssistantMessage(interrupted = false) {
  if (interrupted && currentAssistantEl && currentAssistantText) {
    const marker = document.createElement("span");
    marker.className = "interrupted-marker";
    marker.textContent = " [interrupted]";
    currentAssistantEl.appendChild(marker);
  }
  currentAssistantEl = null;
  currentAssistantText = "";

  // Reset sync state
  discardQueuedText();
  lastDisplayedChunkId = -1;
}

// --- Audio Playback ---
let activeSources = []; // Buffer sources that are scheduled or playing

/**
 * Return the playback AudioContext, creating a new one when none exists or
 * the previous one has been closed.
 *
 * @returns {AudioContext} Context created with sampleRate PLAYBACK_SR.
 */
function getPlaybackCtx() {
  if (!audioCtx || audioCtx.state === "closed") {
    audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  }
  return audioCtx;
}

/**
 * Schedule one audio chunk for gapless playback and pair it with the next
 * queued text chunk, whose display is timed to the chunk's start.
 *
 * The buffer is interpreted as 16-bit signed PCM, one channel, at
 * PLAYBACK_SR. A trailing odd byte is ignored and an empty chunk is skipped
 * (both would otherwise throw); a skipped chunk consumes no text.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw PCM bytes.
 */
function playAudioChunk(arrayBuffer) {
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) return;

  const ctx = getPlaybackCtx();
  const pcm = new Int16Array(arrayBuffer, 0, sampleCount);
  const buffer = ctx.createBuffer(1, sampleCount, PLAYBACK_SR);
  const channel = buffer.getChannelData(0);
  for (let i = 0; i < sampleCount; i++) {
    channel[i] = pcm[i] / 32768; // scale int16 to [-1, 1)
  }

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);
  activeSources.push(source);
  isPlaying = true;

  // Pair this audio chunk with the next queued text chunk
  const textEntry = pendingTextChunks.shift();

  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    // The queue ran dry (or this is the first chunk): restart just ahead of now.
    nextPlayTime = now + 0.01;
  }

  // Schedule text display to coincide with audio playback start
  if (textEntry) {
    const delayMs = Math.max(0, (nextPlayTime - now) * 1000);
    const tid = setTimeout(() => {
      appendAssistantText(textEntry.text);
      lastDisplayedChunkId = textEntry.chunkId;
      scheduledTextTimers = scheduledTextTimers.filter((t) => t !== tid);
    }, delayMs);
    scheduledTextTimers.push(tid);
  }

  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length > 0) return;

    isPlaying = false;
    bargeInCount = 0;
    // If all audio has finished and no more text pending, finalize.
    // NOTE(review): this also fires if playback underruns before the next
    // chunk arrives, which would split one response over two bubbles —
    // confirm whether the server guarantees a "final" message before
    // tightening this condition.
    if (pendingTextChunks.length === 0 && scheduledTextTimers.length === 0) {
      finalizeAssistantMessage(false);
    }
  };

  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}

/**
 * Stop all scheduled/playing audio at once and discard queued text.
 */
function stopPlayback() {
  for (const source of activeSources) {
    try {
      source.stop();
    } catch (_) {
      // Best effort: a source that cannot be stopped is simply dropped.
    }
  }
  activeSources = [];
  nextPlayTime = 0;
  isPlaying = false;
  bargeInCount = 0;

  // Cancel any pending text displays
  discardQueuedText();
}

// --- Microphone ---

/**
 * Turn the microphone off when it is on, and on when it is off.
 */
async function toggleMic() {
  if (micActive) {
    stopMic();
  } else {
    await startMic();
  }
}

/**
 * Root-mean-square level of 16-bit samples, normalised by 32768.
 *
 * @param {Int16Array} samples - PCM samples.
 * @returns {number} RMS level; 0 for an empty frame.
 */
function computeRms(samples) {
  if (samples.length === 0) return 0;
  let sumSquares = 0;
  for (let i = 0; i < samples.length; i++) {
    const s = samples[i] / 32768;
    sumSquares += s * s;
  }
  return Math.sqrt(sumSquares / samples.length);
}

/**
 * Handle one frame posted by the mic worklet: forward it to the server and,
 * while assistant audio is playing, run client-side barge-in detection.
 *
 * @param {MessageEvent} e - Worklet message; `e.data` is sent as-is and read
 *   as Int16 samples (presumably an ArrayBuffer of 16-bit PCM — confirm
 *   against /static/processor.js).
 */
function handleMicFrame(e) {
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
  ws.send(e.data);

  // Client-side barge-in: detect mic energy while playing
  if (!isPlaying) return;

  if (computeRms(new Int16Array(e.data)) > BARGE_IN_THRESHOLD) {
    bargeInCount++;
  } else {
    bargeInCount = 0;
    return;
  }
  if (bargeInCount < BARGE_IN_FRAMES) return;

  // User is speaking over the assistant - interrupt.
  // stopPlayback() also resets isPlaying and bargeInCount.
  stopPlayback();
  const msg = { type: "interrupt" };
  if (lastDisplayedChunkId >= 0) {
    msg.last_chunk_id = lastDisplayedChunkId;
  }
  ws.send(JSON.stringify(msg));
  finalizeAssistantMessage(true);
}

/**
 * Release microphone capture resources. Any argument may be null.
 *
 * @param {MediaStream|null} stream - Stream whose tracks are stopped.
 * @param {AudioWorkletNode|null} node - Node to disconnect.
 * @param {AudioContext|null} ctx - Capture context to close.
 */
function releaseMicResources(stream, node, ctx) {
  if (node) {
    node.disconnect();
  }
  if (ctx) {
    // close() returns a promise; log a rejection instead of leaving it unhandled.
    ctx.close().catch((err) => {
      console.warn("Failed to close mic audio context:", err);
    });
  }
  if (stream) {
    stream.getTracks().forEach((t) => t.stop());
  }
}

/**
 * Start microphone capture and make sure the WebSocket is connected.
 *
 * On any failure, everything acquired so far (media stream, capture context,
 * worklet node) is released again, the error is logged, and the user is
 * alerted.
 */
async function startMic() {
  let stream = null;
  let captureCtx = null;
  let node = null;
  try {
    // Ensure playback context exists (needed for user gesture)
    getPlaybackCtx();
    if (audioCtx.state === "suspended") {
      await audioCtx.resume();
    }

    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Create a separate context at 16kHz for mic capture
    captureCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = captureCtx.createMediaStreamSource(stream);
    await captureCtx.audioWorklet.addModule("/static/processor.js");
    node = new AudioWorkletNode(captureCtx, "pcm-processor");
    source.connect(node);
    node.port.onmessage = handleMicFrame;

    // Store for cleanup
    node._micCtx = captureCtx;

    // Connect WebSocket if not already
    if (!ws || ws.readyState !== WebSocket.OPEN) {
      connectWS();
    }

    // Publish the capture state only once everything above has succeeded.
    micStream = stream;
    workletNode = node;
    micActive = true;
    micBtn.classList.add("active");
  } catch (err) {
    // Do not leave the microphone or the capture context open after a
    // partially completed start.
    releaseMicResources(stream, node, captureCtx);
    console.error("Mic access failed:", err);
    alert("Could not access microphone. Please allow mic permissions.");
  }
}

/**
 * Stop microphone capture and release its resources. The WebSocket is left
 * untouched.
 */
function stopMic() {
  const node = workletNode;
  const ctx = node && node._micCtx ? node._micCtx : null;
  releaseMicResources(micStream, node, ctx);
  workletNode = null;
  micStream = null;
  micActive = false;
  micBtn.classList.remove("active");
}

// --- Voice Selection ---

/**
 * POST the voice chosen in #voice-select to /api/set-voice and report the
 * outcome in #voice-status.
 */
async function applyVoice() {
  const voice = document.getElementById("voice-select").value;
  const statusEl = document.getElementById("voice-status");

  const formData = new FormData();
  formData.append("voice", voice);
  formData.append("lang", "a");

  statusEl.textContent = "Applying...";
  try {
    const resp = await fetch("/api/set-voice", {
      method: "POST",
      body: formData,
    });
    const data = await resp.json();
    statusEl.textContent = data.status === "ok" ? `Voice: ${voice}` : "Failed.";
  } catch (err) {
    statusEl.textContent = `Error: ${err.message}`;
  }
}

// Expose to HTML onclick
window.toggleMic = toggleMic;
window.applyVoice = applyVoice;