// Browser client for the voice/video chat page: streams mic PCM to the server
// over a WebSocket, plays back PCM audio chunks or MP4 "speaking clips", and
// keeps the on-screen transcript in step with what is being played.

// --- State ---
let ws = null;
let audioCtx = null;
let micStream = null;
let workletNode = null;
let micActive = false;
let nextPlayTime = 0;
let isPlaying = false;

const PLAYBACK_SR = 24000; // TTS output sample rate
const MIC_SR = 16000;
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
let bargeInCount = 0;

// --- Text-audio sync state ---
let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to arrive
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user

// --- Video mode state ---
let videoModeEnabled = false; // true when server has video engine active AND ready
let videoModeName = "off"; // "off" | "library" | "reflective"
let idleClipUrl = null; // URL string (server-served) or null
let pendingSpeakingClipMeta = null; // {chunk_id, duration_ms, text} waiting for MP4 binary
let currentSpeakingClipBlobUrl = null;
let speakingClipQueue = []; // [{blobUrl, meta}] clips waiting to play
let currentClipGeneration = 0; // incremented each clip start; guards stale onended handlers

const chatArea = document.getElementById("chat-area");
const statusBadge = document.getElementById("status-badge");
const micBtn = document.getElementById("mic-btn");
const avatarVideo = document.getElementById("avatar-video");
const stageEl = document.getElementById("stage");

// --- WebSocket ---

/**
 * Open the chat WebSocket and install its handlers.
 *
 * No-op when a socket is already OPEN or CONNECTING, so that the reconnect
 * timer and startMic() cannot create two live sockets at once. On close the
 * socket schedules a reconnect after 2 seconds.
 */
function connectWS() {
  if (
    ws &&
    (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)
  ) {
    return;
  }

  const proto = location.protocol === "https:" ? "wss:" : "ws:";
  // Handlers close over `socket` (not the global `ws`) so they always act on
  // the connection they belong to.
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  socket.onopen = () => {
    setStatus("listening");
  };

  socket.onclose = () => {
    // A socket that has already been replaced must not touch status or
    // schedule another reconnect.
    if (ws !== socket) return;
    // An envelope whose binary frame never arrived is meaningless on the
    // next connection.
    pendingSpeakingClipMeta = null;
    setStatus("disconnected");
    setTimeout(connectWS, 2000);
  };

  socket.onerror = () => {
    socket.close();
  };

  socket.onmessage = (event) => {
    if (event.data instanceof ArrayBuffer) {
      // In video mode, the next binary frame after a "speaking_clip"
      // envelope is an MP4 blob; otherwise it's a PCM audio chunk.
      if (pendingSpeakingClipMeta) {
        const meta = pendingSpeakingClipMeta;
        pendingSpeakingClipMeta = null;
        playSpeakingClip(event.data, meta);
      } else if (videoModeEnabled) {
        // Video mode is active but we didn't get a speaking_clip envelope
        // first — ignore raw PCM so we don't double-play audio.
      } else {
        playAudioChunk(event.data);
      }
      return;
    }

    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      // A malformed text frame should not throw out of the message handler.
      console.error("Ignoring malformed WS message:", err);
      return;
    }
    handleJSON(msg);
  };
}

/**
 * Dispatch one parsed JSON message from the server by its `type` field.
 * Unknown types are ignored.
 *
 * @param {object} msg - Parsed message; fields read depend on `msg.type`.
 */
function handleJSON(msg) {
  switch (msg.type) {
    case "status":
      setStatus(msg.state);
      break;
    case "interrupt":
      stopPlayback();
      stopSpeakingClip();
      // Finalize with interrupted marker — text already reflects only what was heard
      finalizeAssistantMessage(true);
      break;
    case "transcript":
      addMessage("user", msg.text);
      break;
    case "response_text":
      if (msg.final) {
        // All chunks sent; finalize will happen when last audio chunk plays
        // (or immediately if nothing was queued)
        if (pendingTextChunks.length === 0 && scheduledTextTimers.length === 0) {
          finalizeAssistantMessage(false);
        }
        // Otherwise, playAudioChunk will finalize after the last scheduled text
      } else {
        // Queue text — it will be displayed when corresponding audio starts playing
        pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
      }
      break;
    case "video_mode":
      // Sent once on WS open. Toggles the video element + speaking-clip path.
      applyVideoModeState(msg);
      break;
    case "speaking_clip":
      // Envelope preceding an MP4 binary frame with the full turn.
      pendingSpeakingClipMeta = {
        chunk_id: msg.chunk_id,
        duration_ms: msg.duration_ms,
        text: msg.text,
      };
      break;
  }
}

// --- Video mode ------------------------------------------------------------

/**
 * Copy the server's "video_mode" message into local state and refresh the stage.
 *
 * @param {object} msg - Reads `enabled`, `ready`, `mode` and `idle_clip_url`.
 */
function applyVideoModeState(msg) {
  videoModeEnabled = !!msg.enabled && !!msg.ready;
  videoModeName = msg.mode || "off";
  idleClipUrl = msg.idle_clip_url || null;
  refreshStage();
}

/**
 * Show or hide the video stage according to the current video-mode state and,
 * when shown, make sure the idle clip is loaded.
 */
function refreshStage() {
  if (!(videoModeEnabled && idleClipUrl)) {
    stageEl.classList.remove("active");
    return;
  }

  stageEl.classList.add("active");

  // Don't pull the video element out from under a speaking clip: doing so
  // would drop its onended handler while currentSpeakingClipBlobUrl stays
  // set, leaving every later clip stuck in the queue. The clip's own end
  // handler switches to the (possibly updated) idle clip.
  if (currentSpeakingClipBlobUrl !== null) return;

  // avatarVideo.src reads back as an absolute URL, so resolve before comparing.
  const idleHref = new URL(idleClipUrl, location.href).href;
  if (avatarVideo.src !== idleHref) {
    _returnToIdle();
  }
}

/**
 * Switch the avatar video to the muted idle clip. Does nothing when no idle
 * clip URL is known.
 */
function _returnToIdle() {
  if (!idleClipUrl) return;
  avatarVideo.onended = null;
  // The idle clip loops until a speaking clip replaces it
  // (_startSpeakingClip turns looping back off).
  avatarVideo.loop = true;
  avatarVideo.muted = true;
  avatarVideo.src = idleClipUrl;
  // Best effort: a rejected play() here just leaves the idle clip paused.
  avatarVideo.play().catch(() => {});
}

/**
 * Wrap an MP4 binary frame in a blob URL and play it, or queue it when
 * another speaking clip is still current.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw MP4 bytes.
 * @param {object} meta - Envelope captured from the "speaking_clip" message.
 */
function playSpeakingClip(arrayBuffer, meta) {
  const blob = new Blob([arrayBuffer], { type: "video/mp4" });
  const blobUrl = URL.createObjectURL(blob);
  if (currentSpeakingClipBlobUrl !== null) {
    // A clip is already playing — queue this one.
    speakingClipQueue.push({ blobUrl, meta });
  } else {
    _startSpeakingClip(blobUrl, meta);
  }
}

/**
 * Start playing one speaking clip and show its text. When the clip finishes
 * (or cannot be played) the next queued clip starts; when the queue is empty
 * the assistant message is finalized and the idle clip returns.
 *
 * @param {string} blobUrl - Object URL of the clip; revoked when the clip is done.
 * @param {object} meta - Clip envelope; `meta.text` is appended to the transcript.
 */
function _startSpeakingClip(blobUrl, meta) {
  const gen = ++currentClipGeneration;
  if (currentSpeakingClipBlobUrl) {
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  }
  currentSpeakingClipBlobUrl = blobUrl;
  avatarVideo.loop = false;
  avatarVideo.muted = false;
  avatarVideo.src = blobUrl;
  if (meta && meta.text) {
    appendAssistantText(meta.text);
  }
  isPlaying = true;

  const finishClip = () => {
    // Stale handler from a replaced/stopped clip, or already finished.
    if (currentClipGeneration !== gen || currentSpeakingClipBlobUrl !== blobUrl) {
      return;
    }
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
    currentSpeakingClipBlobUrl = null;
    const next = speakingClipQueue.shift();
    if (next) {
      _startSpeakingClip(next.blobUrl, next.meta);
    } else {
      isPlaying = false;
      finalizeAssistantMessage(false);
      _returnToIdle();
    }
  };

  avatarVideo.onended = finishClip;
  avatarVideo.play().catch((e) => {
    console.error("speaking clip play failed:", e);
    // A clip that never starts never fires "ended"; advance now so the queue
    // and isPlaying don't stay stuck.
    finishClip();
  });
}

/**
 * Drop all queued speaking clips, stop the current one (if any) and return
 * to the idle clip.
 */
function stopSpeakingClip() {
  // Discard any queued clips.
  for (const { blobUrl } of speakingClipQueue) {
    URL.revokeObjectURL(blobUrl);
  }
  speakingClipQueue = [];
  currentClipGeneration++; // invalidate any in-flight onended handlers
  if (!currentSpeakingClipBlobUrl) return;
  try {
    avatarVideo.pause();
  } catch (_) {}
  avatarVideo.onended = null;
  URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  currentSpeakingClipBlobUrl = null;
  isPlaying = false;
  _returnToIdle();
}

/**
 * Upload the image chosen in #avatar-file to /api/set-avatar, then switch the
 * stage to the idle clip URL returned by the server. Progress and errors are
 * written to #avatar-status.
 */
async function uploadAvatar() {
  const fileInput = document.getElementById("avatar-file");
  const status = document.getElementById("avatar-status");
  if (!fileInput.files || !fileInput.files[0]) {
    status.textContent = "Pick an image first.";
    return;
  }
  status.textContent = "Uploading and rendering idle clip (this takes a while)...";
  const fd = new FormData();
  fd.append("image", fileInput.files[0]);
  try {
    const resp = await fetch("/api/set-avatar", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();
    idleClipUrl = data.idle_clip_url + "?t=" + Date.now(); // cache-bust
    videoModeEnabled = true;
    videoModeName = data.mode || videoModeName;
    refreshStage();
    status.textContent = "Avatar ready (" + data.mode + ")";
  } catch (err) {
    console.error(err);
    status.textContent = "Failed: " + err.message;
  }
}

/**
 * POST the mode chosen in #video-mode-select to /api/set-video-mode and
 * record the mode the server reports back. Hides the stage when that mode
 * is "off".
 */
async function applyVideoMode() {
  const sel = document.getElementById("video-mode-select");
  const status = document.getElementById("avatar-status");
  const fd = new FormData();
  fd.append("mode", sel.value);
  try {
    const resp = await fetch("/api/set-video-mode", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();
    videoModeName = data.mode;
    if (data.mode === "off") {
      videoModeEnabled = false;
      stageEl.classList.remove("active");
    }
    status.textContent = "Mode: " + data.mode + (data.note ? " — " + data.note : "");
  } catch (err) {
    status.textContent = "Failed: " + err.message;
  }
}

// --- Status ---

/**
 * Update the status badge: human-readable label as text, raw state as the
 * CSS class. Unrecognized states are shown verbatim.
 *
 * @param {string} state - e.g. "listening", "thinking", "speaking", "disconnected".
 */
function setStatus(state) {
  statusBadge.textContent =
    state === "listening" ? "Listening" :
    state === "thinking" ? "Thinking..." :
    state === "speaking" ? "Speaking" :
    state === "disconnected" ? "Disconnected" : state;
  statusBadge.className = state;
}

// --- Chat Messages ---
let currentAssistantEl = null;
let currentAssistantText = "";

/**
 * Append a complete message bubble to the chat area and scroll to the bottom.
 *
 * @param {string} role - Used as a CSS class next to "message".
 * @param {string} text - Message text (set via textContent).
 */
function addMessage(role, text) {
  const el = document.createElement("div");
  el.className = `message ${role}`;
  el.textContent = text;
  chatArea.appendChild(el);
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Append text to the in-progress assistant bubble, creating the bubble on
 * first use. Pieces are joined with a single space.
 *
 * @param {string} text - Text to add.
 */
function appendAssistantText(text) {
  if (!currentAssistantEl) {
    currentAssistantEl = document.createElement("div");
    currentAssistantEl.className = "message assistant";
    chatArea.appendChild(currentAssistantEl);
    currentAssistantText = "";
  }
  currentAssistantText += (currentAssistantText ? " " : "") + text;
  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Close the in-progress assistant bubble and reset the text-audio sync state.
 *
 * @param {boolean} [interrupted=false] - When true and the bubble has text,
 *   an " [interrupted]" marker is appended to it.
 */
function finalizeAssistantMessage(interrupted = false) {
  if (interrupted && currentAssistantEl && currentAssistantText) {
    const marker = document.createElement("span");
    marker.className = "interrupted-marker";
    marker.textContent = " [interrupted]";
    currentAssistantEl.appendChild(marker);
  }
  currentAssistantEl = null;
  currentAssistantText = "";
  // Reset sync state
  pendingTextChunks = [];
  for (const tid of scheduledTextTimers) clearTimeout(tid);
  scheduledTextTimers = [];
  lastDisplayedChunkId = -1;
}

// --- Audio Playback ---
let activeSources = [];

/**
 * Return the shared playback AudioContext, creating it (at PLAYBACK_SR) when
 * it does not exist yet or has been closed.
 *
 * @returns {AudioContext}
 */
function getPlaybackCtx() {
  if (!audioCtx || audioCtx.state === "closed") {
    audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  }
  return audioCtx;
}

/**
 * Schedule one chunk of 16-bit PCM for gapless playback and pair it with the
 * next queued text chunk, whose display is timed to the chunk's start.
 *
 * @param {ArrayBuffer} arrayBuffer - Mono signed 16-bit samples; played at PLAYBACK_SR.
 */
function playAudioChunk(arrayBuffer) {
  // Int16Array throws on an odd byte length and createBuffer throws on a
  // zero-length buffer, so drop a trailing odd byte and skip empty chunks.
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) return;

  const ctx = getPlaybackCtx();
  const int16 = new Int16Array(arrayBuffer, 0, sampleCount);
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }

  const buffer = ctx.createBuffer(1, float32.length, PLAYBACK_SR);
  buffer.getChannelData(0).set(float32);

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);
  activeSources.push(source);
  isPlaying = true;

  // Pair this audio chunk with the next queued text chunk
  const textEntry = pendingTextChunks.shift();

  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    nextPlayTime = now + 0.01;
  }

  // Schedule text display to coincide with audio playback start
  if (textEntry) {
    const delayMs = Math.max(0, (nextPlayTime - now) * 1000);
    const tid = setTimeout(() => {
      appendAssistantText(textEntry.text);
      lastDisplayedChunkId = textEntry.chunkId;
      scheduledTextTimers = scheduledTextTimers.filter((t) => t !== tid);
    }, delayMs);
    scheduledTextTimers.push(tid);
  }

  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length === 0) {
      isPlaying = false;
      bargeInCount = 0;
      // If all audio has finished and no more text pending, finalize
      if (pendingTextChunks.length === 0 && scheduledTextTimers.length === 0) {
        finalizeAssistantMessage(false);
      }
    }
  };

  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}

/**
 * Stop all scheduled PCM playback immediately and discard text that was
 * waiting to be displayed. Does not finalize the assistant message; callers
 * do that themselves.
 */
function stopPlayback() {
  for (const source of activeSources) {
    // Detach first: stop() still fires "ended" later, and that stale handler
    // must not reset state belonging to whatever plays next.
    source.onended = null;
    try {
      source.stop();
    } catch (_) {}
  }
  activeSources = [];
  nextPlayTime = 0;
  isPlaying = false;
  bargeInCount = 0;
  // Cancel any pending text displays
  for (const tid of scheduledTextTimers) clearTimeout(tid);
  scheduledTextTimers = [];
  pendingTextChunks = [];
}

// --- Microphone ---

/** Start the microphone if it is off, stop it if it is on. */
async function toggleMic() {
  if (micActive) {
    stopMic();
  } else {
    await startMic();
  }
}

/**
 * Acquire the microphone, stream its PCM frames to the server through an
 * AudioWorklet, and run client-side barge-in detection while the assistant
 * is playing. On any failure the partially acquired resources are released
 * and the user is alerted.
 */
async function startMic() {
  let micCtx = null;
  try {
    // Ensure playback context exists (needed for user gesture)
    getPlaybackCtx();
    if (audioCtx.state === "suspended") {
      await audioCtx.resume();
    }

    micStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Create a separate context at 16kHz for mic capture
    micCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = micCtx.createMediaStreamSource(micStream);

    await micCtx.audioWorklet.addModule("/static/processor.js");
    workletNode = new AudioWorkletNode(micCtx, "pcm-processor");
    // Store for cleanup
    workletNode._micCtx = micCtx;
    source.connect(workletNode);

    workletNode.port.onmessage = (e) => {
      if (ws && ws.readyState === WebSocket.OPEN) {
        ws.send(e.data);

        // Client-side barge-in: detect mic energy while playing
        if (isPlaying) {
          const samples = new Int16Array(e.data);
          let sum = 0;
          for (let i = 0; i < samples.length; i++) {
            const s = samples[i] / 32768;
            sum += s * s;
          }
          const rms = Math.sqrt(sum / samples.length);
          if (rms > BARGE_IN_THRESHOLD) {
            bargeInCount++;
            if (bargeInCount >= BARGE_IN_FRAMES) {
              // User is speaking over the assistant - interrupt
              stopPlayback();
              stopSpeakingClip();
              const msg = { type: "interrupt" };
              if (lastDisplayedChunkId >= 0) {
                msg.last_chunk_id = lastDisplayedChunkId;
              }
              ws.send(JSON.stringify(msg));
              finalizeAssistantMessage(true);
              isPlaying = false;
              bargeInCount = 0;
            }
          } else {
            bargeInCount = 0;
          }
        }
      }
    };

    micActive = true;
    micBtn.classList.add("active");

    // Connect WebSocket if not already (connectWS itself skips a socket that
    // is still connecting).
    if (!ws || ws.readyState !== WebSocket.OPEN) {
      connectWS();
    }
  } catch (err) {
    console.error("Mic access failed:", err);
    // Release whatever was acquired before the failure so the mic does not
    // stay live. stopMic() closes the capture context only when the worklet
    // node already owns it; otherwise close it here.
    const nodeOwnsCtx = workletNode !== null && workletNode._micCtx === micCtx;
    if (micCtx && !nodeOwnsCtx && micCtx.state !== "closed") {
      micCtx.close().catch((closeErr) => {
        console.error("Closing mic context failed:", closeErr);
      });
    }
    stopMic();
    alert("Could not access microphone. Please allow mic permissions.");
  }
}

/**
 * Tear down mic capture: disconnect the worklet node, close its capture
 * context, stop the media tracks and clear the active state on the button.
 */
function stopMic() {
  if (workletNode) {
    workletNode.disconnect();
    if (workletNode._micCtx) {
      workletNode._micCtx.close();
    }
    workletNode = null;
  }
  if (micStream) {
    micStream.getTracks().forEach((t) => t.stop());
    micStream = null;
  }
  micActive = false;
  micBtn.classList.remove("active");
}

// --- Voice Selection ---

/**
 * POST the voice chosen in #voice-select to /api/set-voice and report the
 * outcome in #voice-status.
 */
async function applyVoice() {
  const voice = document.getElementById("voice-select").value;
  const statusEl = document.getElementById("voice-status");

  const formData = new FormData();
  formData.append("voice", voice);
  formData.append("lang", "a");

  statusEl.textContent = "Applying...";
  try {
    const resp = await fetch("/api/set-voice", {
      method: "POST",
      body: formData,
    });
    const data = await resp.json();
    if (data.status === "ok") {
      statusEl.textContent = "Voice: " + voice;
    } else {
      statusEl.textContent = "Failed.";
    }
  } catch (err) {
    statusEl.textContent = "Error: " + err.message;
  }
}

// Expose to HTML onclick
window.toggleMic = toggleMic;
window.applyVoice = applyVoice;
window.uploadAvatar = uploadAvatar;
window.applyVideoMode = applyVideoMode;