Files
live-voice-chat/static/app.js
T
bhetherman 44a10667c2 Enhance video handling and performance optimizations
- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
2026-04-24 00:36:18 -04:00

536 lines
15 KiB
JavaScript

// --- State ---
// Module-level mutable state shared by the WebSocket, playback, video and
// microphone code in this file.
let ws = null; // current WebSocket; assigned by connectWS
let audioCtx = null; // playback AudioContext; created lazily by getPlaybackCtx
let micStream = null; // MediaStream returned by getUserMedia while the mic is on
let workletNode = null; // AudioWorkletNode whose port delivers mic frames
let micActive = false; // true between a successful startMic and stopMic
let nextPlayTime = 0; // AudioContext time at which the next PCM chunk is scheduled
let isPlaying = false; // true while PCM audio or a speaking clip is playing
const PLAYBACK_SR = 24000; // TTS output sample rate
const MIC_SR = 16000; // sample rate requested for mic capture
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
let bargeInCount = 0; // consecutive mic frames above BARGE_IN_THRESHOLD so far
// --- Text-audio sync state ---
let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to arrive
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user
// --- Video mode state ---
let videoModeEnabled = false; // true when server has video engine active AND ready
let videoModeName = "off"; // "off" | "library" | "reflective"
let idleClipUrl = null; // URL string (server-served) or null
let pendingSpeakingClipMeta = null; // {chunk_id, duration_ms, text} waiting for MP4 binary
let currentSpeakingClipBlobUrl = null; // blob URL of the speaking clip now playing, or null
let speakingClipQueue = []; // [{blobUrl, meta}] clips waiting to play
let currentClipGeneration = 0; // incremented each clip start; guards stale onended handlers
// --- DOM references (looked up once at script load) ---
const chatArea = document.getElementById("chat-area");
const statusBadge = document.getElementById("status-badge");
const micBtn = document.getElementById("mic-btn");
const avatarVideo = document.getElementById("avatar-video");
const stageEl = document.getElementById("stage");
// --- WebSocket ---
/**
 * Open the chat WebSocket and attach its lifecycle and message handlers.
 *
 * Does nothing when a socket is already open or still connecting, so repeated
 * calls (reconnect timer, mic start) cannot stack parallel connections that
 * would each deliver every message.
 *
 * Binary frames are routed by state: a frame that follows a "speaking_clip"
 * envelope goes to playSpeakingClip; otherwise it is PCM for playAudioChunk,
 * unless video mode is on, in which case it is dropped. Text frames are parsed
 * as JSON and handed to handleJSON; malformed JSON is logged and ignored.
 */
function connectWS() {
  if (
    ws &&
    (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING)
  ) {
    return;
  }

  const proto = location.protocol === "https:" ? "wss:" : "ws:";
  // Handlers close over `socket`, not the global `ws`, so a late event from
  // an old connection can never act on its replacement.
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  socket.onopen = () => {
    setStatus("listening");
  };

  socket.onclose = () => {
    // Ignore the close of a socket that has already been replaced.
    if (ws !== socket) return;
    // An envelope whose binary frame never arrived must not claim the first
    // binary frame of the next connection.
    pendingSpeakingClipMeta = null;
    setStatus("disconnected");
    setTimeout(connectWS, 2000);
  };

  socket.onerror = () => {
    socket.close();
  };

  socket.onmessage = (event) => {
    if (event.data instanceof ArrayBuffer) {
      // In video mode, the next binary frame after a "speaking_clip"
      // envelope is an MP4 blob; otherwise it's a PCM audio chunk.
      if (pendingSpeakingClipMeta) {
        const meta = pendingSpeakingClipMeta;
        pendingSpeakingClipMeta = null;
        playSpeakingClip(event.data, meta);
      } else if (!videoModeEnabled) {
        playAudioChunk(event.data);
      }
      // Video mode without an envelope: drop raw PCM so audio is not
      // played twice.
      return;
    }

    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      console.error("Ignoring malformed WebSocket message:", err);
      return;
    }
    handleJSON(msg);
  };
}
/**
 * Dispatch one parsed server message by its `type` field.
 *
 * Handles "status", "interrupt", "transcript", "response_text",
 * "video_mode" and "speaking_clip"; any other type is ignored.
 *
 * @param {object} msg - Parsed JSON message from the WebSocket.
 */
function handleJSON(msg) {
  const kind = msg.type;

  if (kind === "status") {
    setStatus(msg.state);
    return;
  }

  if (kind === "interrupt") {
    stopPlayback();
    stopSpeakingClip();
    // Mark the bubble as interrupted; its text already reflects only what
    // was actually played.
    finalizeAssistantMessage(true);
    return;
  }

  if (kind === "transcript") {
    addMessage("user", msg.text);
    return;
  }

  if (kind === "response_text") {
    if (!msg.final) {
      // Hold the text until the matching audio chunk is scheduled.
      pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
      return;
    }
    // Final marker: finalize now only when nothing is queued or scheduled;
    // otherwise playAudioChunk finalizes once the last audio has ended.
    const nothingOutstanding =
      pendingTextChunks.length === 0 && scheduledTextTimers.length === 0;
    if (nothingOutstanding) {
      finalizeAssistantMessage(false);
    }
    return;
  }

  if (kind === "video_mode") {
    // Toggles the video element and the speaking-clip path.
    applyVideoModeState(msg);
    return;
  }

  if (kind === "speaking_clip") {
    // Envelope announcing that the next binary frame is an MP4 clip.
    const { chunk_id, duration_ms, text } = msg;
    pendingSpeakingClipMeta = { chunk_id, duration_ms, text };
  }
}
// --- Video mode ------------------------------------------------------------
/**
 * Copy the server's video-mode announcement into module state and refresh
 * the stage.
 *
 * Video mode counts as enabled only when both `enabled` and `ready` are
 * truthy. A missing mode becomes "off" and a missing idle clip URL becomes
 * null.
 *
 * @param {object} msg - Message with `enabled`, `ready`, `mode` and
 *   `idle_clip_url` fields.
 */
function applyVideoModeState(msg) {
  const { enabled, ready, mode, idle_clip_url: clipUrl } = msg;
  videoModeEnabled = Boolean(enabled) && Boolean(ready);
  videoModeName = mode ? mode : "off";
  idleClipUrl = clipUrl ? clipUrl : null;
  refreshStage();
}
/**
 * Show or hide the avatar stage to match the current video-mode state and
 * make sure the idle clip is loaded when nothing else is playing.
 *
 * While a speaking clip is active the video element is left alone. Replacing
 * its source here would drop the clip's onended handler while
 * currentSpeakingClipBlobUrl is still set, and every later clip would be
 * queued forever. The clip's own end/stop paths call _returnToIdle, which
 * picks up the latest idleClipUrl.
 */
function refreshStage() {
  if (!(videoModeEnabled && idleClipUrl)) {
    stageEl.classList.remove("active");
    return;
  }

  stageEl.classList.add("active");
  if (currentSpeakingClipBlobUrl !== null) return;

  // Resolve against the page URL so relative, absolute and normalised forms
  // of the same clip compare equal to the absolute URL that
  // HTMLMediaElement.src reports.
  const idleHref = new URL(idleClipUrl, location.href).href;
  if (avatarVideo.src !== idleHref) {
    _returnToIdle();
  }
}
/**
 * Switch the avatar video back to the muted idle clip.
 *
 * No-op when no idle clip URL is known. Any onended handler left by a
 * speaking clip is cleared first. A rejected play() is deliberately ignored.
 *
 * NOTE(review): `loop` is set to false, so the idle clip plays through once
 * and then stops — confirm this is intended rather than a looping idle.
 */
function _returnToIdle() {
  if (idleClipUrl) {
    Object.assign(avatarVideo, {
      onended: null,
      loop: false,
      muted: true,
      src: idleClipUrl,
    });
    avatarVideo.play().catch(() => {});
  }
}
/**
 * Wrap a received MP4 frame in a blob URL and either play it now or queue it
 * behind the clip that is currently playing.
 *
 * @param {ArrayBuffer} arrayBuffer - MP4 bytes from the WebSocket.
 * @param {object} meta - Envelope recorded for this clip ({chunk_id,
 *   duration_ms, text}).
 */
function playSpeakingClip(arrayBuffer, meta) {
  const blobUrl = URL.createObjectURL(
    new Blob([arrayBuffer], { type: "video/mp4" })
  );

  if (currentSpeakingClipBlobUrl === null) {
    _startSpeakingClip(blobUrl, meta);
    return;
  }
  // Another clip is on screen; this one waits its turn.
  speakingClipQueue.push({ blobUrl, meta });
}
/**
 * Start playing one speaking clip on the avatar video, unmuted, and show its
 * text in the assistant bubble.
 *
 * When the clip finishes, or when play() is rejected, the next queued clip is
 * started. If the queue is empty the assistant message is finalized and the
 * video returns to idle. A rejected play() must advance the queue: otherwise
 * `isPlaying` and `currentSpeakingClipBlobUrl` would stay set forever and
 * every later clip would pile up in the queue.
 *
 * @param {string} blobUrl - Object URL of the MP4 clip; revoked when done.
 * @param {object} meta - Clip envelope; `meta.text` (if any) is displayed.
 */
function _startSpeakingClip(blobUrl, meta) {
  const gen = ++currentClipGeneration;
  if (currentSpeakingClipBlobUrl) {
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  }
  currentSpeakingClipBlobUrl = blobUrl;
  avatarVideo.loop = false;
  avatarVideo.muted = false;
  avatarVideo.src = blobUrl;
  if (meta && meta.text) {
    appendAssistantText(meta.text);
  }
  isPlaying = true;

  // Shared by the "ended" event and the play() failure path.
  const advance = () => {
    // A newer clip or stopSpeakingClip bumped the generation: this callback
    // belongs to a clip that has already been replaced.
    if (currentClipGeneration !== gen) return;
    URL.revokeObjectURL(blobUrl);
    currentSpeakingClipBlobUrl = null;
    const next = speakingClipQueue.shift();
    if (next) {
      _startSpeakingClip(next.blobUrl, next.meta);
      return;
    }
    isPlaying = false;
    finalizeAssistantMessage(false);
    _returnToIdle();
  };

  avatarVideo.onended = advance;
  avatarVideo.play().catch((e) => {
    console.error("speaking clip play failed:", e);
    advance();
  });
}
/**
 * Abort speaking-clip playback: drop every queued clip, invalidate pending
 * onended handlers, and, if a clip is on screen, pause it, release its blob
 * URL and go back to the idle clip.
 */
function stopSpeakingClip() {
  // Queued clips will never play; release their blob URLs.
  speakingClipQueue.forEach((queued) => URL.revokeObjectURL(queued.blobUrl));
  speakingClipQueue = [];

  // Any onended handler captured before this point is now stale.
  currentClipGeneration += 1;

  if (currentSpeakingClipBlobUrl) {
    try {
      avatarVideo.pause();
    } catch (_) {}
    avatarVideo.onended = null;
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
    currentSpeakingClipBlobUrl = null;
    isPlaying = false;
    _returnToIdle();
  }
}
/**
 * Upload the image chosen in #avatar-file to /api/set-avatar and switch the
 * stage to the idle clip the server returns.
 *
 * Progress and failures are reported through #avatar-status; errors are
 * logged and shown, never rethrown.
 */
async function uploadAvatar() {
  const fileInput = document.getElementById("avatar-file");
  const statusEl = document.getElementById("avatar-status");

  const picked = fileInput.files && fileInput.files[0];
  if (!picked) {
    statusEl.textContent = "Pick an image first.";
    return;
  }

  statusEl.textContent = "Uploading and rendering idle clip (this takes a while)...";
  const payload = new FormData();
  payload.append("image", fileInput.files[0]);

  try {
    const resp = await fetch("/api/set-avatar", { method: "POST", body: payload });
    if (!resp.ok) {
      throw new Error(await resp.text());
    }
    const data = await resp.json();
    // The timestamp query string defeats caching of a previous idle clip.
    idleClipUrl = `${data.idle_clip_url}?t=${Date.now()}`;
    videoModeEnabled = true;
    videoModeName = data.mode || videoModeName;
    refreshStage();
    statusEl.textContent = `Avatar ready (${data.mode})`;
  } catch (err) {
    console.error(err);
    statusEl.textContent = `Failed: ${err.message}`;
  }
}
/**
 * Send the mode selected in #video-mode-select to /api/set-video-mode and
 * mirror the server's answer locally.
 *
 * When the server reports "off", video mode is disabled and the stage is
 * hidden. The outcome (or failure) is shown in #avatar-status.
 */
async function applyVideoMode() {
  const modeSelect = document.getElementById("video-mode-select");
  const statusEl = document.getElementById("avatar-status");
  const payload = new FormData();
  payload.append("mode", modeSelect.value);

  try {
    const resp = await fetch("/api/set-video-mode", { method: "POST", body: payload });
    if (!resp.ok) {
      throw new Error(await resp.text());
    }
    const data = await resp.json();
    videoModeName = data.mode;
    if (data.mode === "off") {
      videoModeEnabled = false;
      stageEl.classList.remove("active");
    }
    const noteSuffix = data.note ? ` — ${data.note}` : "";
    statusEl.textContent = `Mode: ${data.mode}${noteSuffix}`;
  } catch (err) {
    statusEl.textContent = `Failed: ${err.message}`;
  }
}
// --- Status ---
/**
 * Update the status badge: a human-readable label as its text and the raw
 * state as its CSS class. Unknown states are displayed verbatim.
 *
 * @param {string} state - e.g. "listening", "thinking", "speaking",
 *   "disconnected".
 */
function setStatus(state) {
  let label;
  switch (state) {
    case "listening":
      label = "Listening";
      break;
    case "thinking":
      label = "Thinking...";
      break;
    case "speaking":
      label = "Speaking";
      break;
    case "disconnected":
      label = "Disconnected";
      break;
    default:
      label = state;
  }
  statusBadge.textContent = label;
  statusBadge.className = state;
}
// --- Chat Messages ---
// Bubble element of the assistant message currently being built, or null
// when no assistant message is open.
let currentAssistantEl = null;
// Text accumulated so far for the open assistant message.
let currentAssistantText = "";
/**
 * Append a finished chat bubble to the chat area and scroll to the bottom.
 *
 * @param {string} role - Used as a CSS class next to "message".
 * @param {string} text - Bubble content; assigned as textContent.
 */
function addMessage(role, text) {
  const bubble = Object.assign(document.createElement("div"), {
    className: `message ${role}`,
    textContent: text,
  });
  chatArea.appendChild(bubble);
  chatArea.scrollTop = chatArea.scrollHeight;
}
/**
 * Add a piece of text to the assistant bubble being built, opening a new
 * bubble first if none is open. Pieces are joined with a single space.
 *
 * @param {string} text - Text to add.
 */
function appendAssistantText(text) {
  if (!currentAssistantEl) {
    const bubble = document.createElement("div");
    bubble.className = "message assistant";
    chatArea.appendChild(bubble);
    currentAssistantEl = bubble;
    currentAssistantText = "";
  }

  const separator = currentAssistantText ? " " : "";
  currentAssistantText = `${currentAssistantText}${separator}${text}`;
  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}
/**
 * Close the assistant message being built and reset the text/audio sync
 * state (pending text, scheduled text timers, last displayed chunk id).
 *
 * @param {boolean} [interrupted=false] - When true and the open bubble has
 *   text, an " [interrupted]" marker is appended to it.
 */
function finalizeAssistantMessage(interrupted = false) {
  const needsMarker = interrupted && currentAssistantEl && currentAssistantText;
  if (needsMarker) {
    const marker = Object.assign(document.createElement("span"), {
      className: "interrupted-marker",
      textContent: " [interrupted]",
    });
    currentAssistantEl.appendChild(marker);
  }

  currentAssistantEl = null;
  currentAssistantText = "";

  // Cancel text that was scheduled but not yet shown, then reset sync state.
  scheduledTextTimers.forEach((tid) => clearTimeout(tid));
  scheduledTextTimers = [];
  pendingTextChunks = [];
  lastDisplayedChunkId = -1;
}
// --- Audio Playback ---
// Buffer sources that playAudioChunk has scheduled and that have not ended
// yet; emptied by stopPlayback.
let activeSources = [];
/**
 * Return the shared playback AudioContext, creating a new one at PLAYBACK_SR
 * when none exists yet or the previous one has been closed.
 *
 * @returns {AudioContext} The playback context.
 */
function getPlaybackCtx() {
  const reusable = audioCtx && audioCtx.state !== "closed";
  if (!reusable) {
    audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  }
  return audioCtx;
}
/**
 * Schedule one chunk of 16-bit mono PCM for gapless playback and line up the
 * next pending text chunk to appear when this audio starts.
 *
 * Frames that contain no complete sample are ignored without touching any
 * state, and a trailing odd byte is dropped. Previously an empty frame made
 * createBuffer throw and an odd-length frame made the Int16Array constructor
 * throw, both inside the WebSocket message handler.
 *
 * When the last active source ends and no text is pending or scheduled, the
 * assistant message is finalized.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw PCM samples (signed 16-bit).
 */
function playAudioChunk(arrayBuffer) {
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) return;

  const ctx = getPlaybackCtx();
  const int16 = new Int16Array(arrayBuffer, 0, sampleCount);
  // Scale int16 samples into the [-1, 1) float range Web Audio expects.
  const float32 = Float32Array.from(int16, (sample) => sample / 32768);
  const buffer = ctx.createBuffer(1, sampleCount, PLAYBACK_SR);
  buffer.getChannelData(0).set(float32);

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);
  activeSources.push(source);
  isPlaying = true;

  // Pair this audio chunk with the next queued text chunk.
  const textEntry = pendingTextChunks.shift();
  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    // The schedule has fallen behind the clock; restart just ahead of now.
    nextPlayTime = now + 0.01;
  }

  // Show the text at (approximately) the moment its audio starts.
  if (textEntry) {
    const delayMs = Math.max(0, (nextPlayTime - now) * 1000);
    const tid = setTimeout(() => {
      appendAssistantText(textEntry.text);
      lastDisplayedChunkId = textEntry.chunkId;
      scheduledTextTimers = scheduledTextTimers.filter((t) => t !== tid);
    }, delayMs);
    scheduledTextTimers.push(tid);
  }

  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length === 0) {
      isPlaying = false;
      bargeInCount = 0;
      // All audio has finished; finalize unless text is still outstanding.
      if (pendingTextChunks.length === 0 && scheduledTextTimers.length === 0) {
        finalizeAssistantMessage(false);
      }
    }
  };

  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}
/**
 * Stop all scheduled PCM playback immediately and discard text that was
 * waiting for audio.
 *
 * Each source's onended handler is detached before it is stopped. Otherwise
 * the handlers installed by playAudioChunk fire after the stop, see an empty
 * activeSources list, and call finalizeAssistantMessage(false) / clear
 * isPlaying against whatever is playing by then.
 */
function stopPlayback() {
  for (const source of activeSources) {
    source.onended = null;
    try {
      source.stop();
    } catch (_) {
      // Best effort: a source that cannot be stopped is simply dropped.
    }
  }
  activeSources = [];
  nextPlayTime = 0;
  isPlaying = false;
  bargeInCount = 0;

  // Cancel any pending text displays.
  for (const tid of scheduledTextTimers) clearTimeout(tid);
  scheduledTextTimers = [];
  pendingTextChunks = [];
}
// --- Microphone ---
/**
 * Flip the microphone: stop it when it is active, otherwise start it and
 * wait for the start to complete.
 */
async function toggleMic() {
  if (!micActive) {
    await startMic();
    return;
  }
  stopMic();
}
/**
 * RMS energy of one frame of signed 16-bit PCM, normalised to [0, 1].
 * Returns 0 for an empty frame.
 */
function _micFrameRms(pcmBuffer) {
  const samples = new Int16Array(pcmBuffer);
  if (samples.length === 0) return 0;
  let sumSquares = 0;
  for (const sample of samples) {
    const s = sample / 32768;
    sumSquares += s * s;
  }
  return Math.sqrt(sumSquares / samples.length);
}

/**
 * Forward one mic frame to the server and, while the assistant is playing,
 * run client-side barge-in detection on it.
 */
function _handleMicFrame(pcmBuffer) {
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
  ws.send(pcmBuffer);

  // Barge-in only matters while the assistant is audible.
  if (!isPlaying) return;

  if (_micFrameRms(pcmBuffer) <= BARGE_IN_THRESHOLD) {
    bargeInCount = 0;
    return;
  }
  bargeInCount++;
  if (bargeInCount < BARGE_IN_FRAMES) return;

  // The user is speaking over the assistant: cut playback and tell the
  // server how far the displayed text got.
  stopPlayback();
  stopSpeakingClip();
  const msg = { type: "interrupt" };
  if (lastDisplayedChunkId >= 0) {
    msg.last_chunk_id = lastDisplayedChunkId;
  }
  ws.send(JSON.stringify(msg));
  finalizeAssistantMessage(true);
  isPlaying = false;
  bargeInCount = 0;
}

/**
 * Start microphone capture and stream PCM frames to the server.
 *
 * Resumes the playback context (the call is expected to come from a user
 * gesture), opens the mic, builds a MIC_SR capture context with the
 * "pcm-processor" worklet, and connects the WebSocket unless one is already
 * open or connecting.
 *
 * On any failure the mic tracks and capture context acquired so far are
 * released, so a failed start does not leave the microphone live, and the
 * user is alerted. Module state (micStream, workletNode, micActive) is only
 * updated once everything has succeeded.
 */
async function startMic() {
  let stream = null;
  let micCtx = null;
  try {
    // Ensure playback context exists (needed for user gesture).
    getPlaybackCtx();
    if (audioCtx.state === "suspended") {
      await audioCtx.resume();
    }

    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Capture runs in its own context at the mic sample rate.
    micCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = micCtx.createMediaStreamSource(stream);
    await micCtx.audioWorklet.addModule("/static/processor.js");
    const node = new AudioWorkletNode(micCtx, "pcm-processor");
    source.connect(node);
    node.port.onmessage = (e) => _handleMicFrame(e.data);
    // Kept on the node so stopMic can close the capture context.
    node._micCtx = micCtx;

    // A socket that is still connecting will open on its own; starting
    // another one would duplicate every message.
    const socketUsable =
      ws &&
      (ws.readyState === WebSocket.OPEN ||
        ws.readyState === WebSocket.CONNECTING);
    if (!socketUsable) {
      connectWS();
    }

    micBtn.classList.add("active");
    micStream = stream;
    workletNode = node;
    micActive = true;
  } catch (err) {
    console.error("Mic access failed:", err);
    // Release whatever was acquired before the failure.
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
    }
    if (micCtx && micCtx.state !== "closed") {
      Promise.resolve(micCtx.close()).catch((closeErr) => {
        console.error("Closing mic context failed:", closeErr);
      });
    }
    alert("Could not access microphone. Please allow mic permissions.");
  }
}
/**
 * Stop microphone capture: disconnect the worklet node, close the capture
 * context stored on it, stop every track of the mic stream, and clear the
 * mic button's active state.
 */
function stopMic() {
  const node = workletNode;
  if (node) {
    node.disconnect();
    const captureCtx = node._micCtx;
    if (captureCtx) {
      captureCtx.close();
    }
    workletNode = null;
  }

  if (micStream) {
    for (const track of micStream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }

  micActive = false;
  micBtn.classList.remove("active");
}
// --- Voice Selection ---
/**
 * Send the voice selected in #voice-select to /api/set-voice and report the
 * outcome in #voice-status.
 *
 * A non-2xx response is treated as an error and its body text (or the HTTP
 * status when the body is empty) is shown. This matches uploadAvatar and
 * applyVideoMode; without the check an error page would surface as a JSON
 * parse error.
 */
async function applyVoice() {
  const voice = document.getElementById("voice-select").value;
  const statusEl = document.getElementById("voice-status");
  const formData = new FormData();
  formData.append("voice", voice);
  formData.append("lang", "a");
  statusEl.textContent = "Applying...";
  try {
    const resp = await fetch("/api/set-voice", {
      method: "POST",
      body: formData,
    });
    if (!resp.ok) {
      const detail = await resp.text();
      throw new Error(detail || `HTTP ${resp.status}`);
    }
    const data = await resp.json();
    statusEl.textContent = data.status === "ok" ? `Voice: ${voice}` : "Failed.";
  } catch (err) {
    statusEl.textContent = `Error: ${err.message}`;
  }
}
// Expose to HTML onclick
// Inline onclick attributes resolve handler names on window, so the handlers
// are published there.
Object.assign(window, {
  toggleMic,
  applyVoice,
  uploadAvatar,
  applyVideoMode,
});