// Commit 44a10667c2
// - Added environment variables to prevent CPU thread pools from busy-waiting.
// - Deferred loading of video models until first use to reduce VRAM footprint.
// - Implemented streaming of speaking clips for improved responsiveness.
// - Introduced a queue for managing speaking clips to handle multiple requests smoothly.
// - Updated video playback logic to ensure proper handling of clip generation.
// 536 lines, 15 KiB, JavaScript
// --- State ---

// Connection and audio handles. All start empty and are filled in lazily by
// connectWS(), getPlaybackCtx() and startMic().
let ws = null;
let audioCtx = null;
let micStream = null;
let workletNode = null;
let micActive = false;

// Playback scheduling: AudioContext time at which the next PCM chunk should
// start, and whether assistant output (audio or speaking clip) is in progress.
let nextPlayTime = 0;
let isPlaying = false;

// Sample rates (Hz).
const PLAYBACK_SR = 24000; // TTS output sample rate
const MIC_SR = 16000; // microphone capture context sample rate

// Client-side barge-in detection.
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // consecutive frames above threshold to trigger
let bargeInCount = 0; // running count of consecutive loud frames

// --- Text-audio sync state ---

// [{chunkId, text}] - text waiting for its audio to arrive
let pendingTextChunks = [];
// timer IDs for text display scheduled to match audio playback
let scheduledTextTimers = [];
// last chunk whose text was actually shown to the user (-1 = none yet)
let lastDisplayedChunkId = -1;

// --- Video mode state ---

// true when server has video engine active AND ready
let videoModeEnabled = false;
// "off" | "library" | "reflective"
let videoModeName = "off";
// URL string (server-served) or null
let idleClipUrl = null;
// {chunk_id, duration_ms, text} waiting for its MP4 binary frame
let pendingSpeakingClipMeta = null;
// Object URL of the speaking clip currently loaded in the video element
let currentSpeakingClipBlobUrl = null;
// [{blobUrl, meta}] clips waiting to play
let speakingClipQueue = [];
// incremented each clip start; guards stale onended handlers
let currentClipGeneration = 0;
|
|
|
|
// Cached references to the page elements this script drives. Looked up once,
// in the same order as the names on the left-hand side.
const [chatArea, statusBadge, micBtn, avatarVideo, stageEl] = [
  "chat-area",
  "status-badge",
  "mic-btn",
  "avatar-video",
  "stage",
].map((id) => document.getElementById(id));
|
|
|
|
// --- WebSocket ---
|
|
|
|
/**
 * Open the chat WebSocket (`/ws/chat` on the current host) and install its
 * handlers. The new socket is stored in the module-level `ws`.
 *
 * Handlers close over the local `socket` instead of reading `ws`, because
 * `ws` may already point at a newer connection by the time an old socket
 * reports an error or closes.
 *
 * Incoming frames:
 *  - binary after a "speaking_clip" envelope -> MP4, passed to playSpeakingClip
 *  - binary otherwise -> PCM, passed to playAudioChunk (ignored in video mode)
 *  - text -> parsed as JSON and passed to handleJSON
 */
function connectWS() {
  const proto = location.protocol === "https:" ? "wss:" : "ws:";
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  socket.onopen = () => {
    setStatus("listening");
  };

  socket.onclose = () => {
    // A superseded socket must not flip the status badge or start another
    // reconnect loop next to the connection that replaced it.
    if (ws !== socket) return;
    // The MP4 frame announced by a pending envelope can no longer arrive;
    // keeping the envelope would make the first PCM frame of the next
    // connection be treated as a video clip.
    pendingSpeakingClipMeta = null;
    setStatus("disconnected");
    setTimeout(connectWS, 2000);
  };

  socket.onerror = () => {
    // Close the socket that actually failed; onclose drives the reconnect.
    socket.close();
  };

  socket.onmessage = (event) => {
    if (event.data instanceof ArrayBuffer) {
      // In video mode, the next binary frame after a "speaking_clip"
      // envelope is an MP4 blob; otherwise it's a PCM audio chunk.
      if (pendingSpeakingClipMeta) {
        const meta = pendingSpeakingClipMeta;
        pendingSpeakingClipMeta = null;
        playSpeakingClip(event.data, meta);
      } else if (!videoModeEnabled) {
        playAudioChunk(event.data);
      }
      // else: video mode is active but no envelope preceded this frame;
      // ignore raw PCM so audio is not played twice.
      return;
    }

    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      // One malformed frame should not surface as an uncaught exception.
      console.error("Ignoring malformed WebSocket message:", err);
      return;
    }
    handleJSON(msg);
  };
}
|
|
|
|
/**
 * Dispatch one decoded JSON control message from the server.
 *
 * Handled `msg.type` values: "status", "interrupt", "transcript",
 * "response_text", "video_mode" and "speaking_clip". Anything else is
 * ignored.
 *
 * @param {object} msg - Parsed message; non-object payloads are ignored.
 */
function handleJSON(msg) {
  // JSON.parse can legitimately yield null or a primitive; reading `.type`
  // on null would throw.
  if (msg === null || typeof msg !== "object") return;

  switch (msg.type) {
    case "status":
      setStatus(msg.state);
      break;

    case "interrupt":
      stopPlayback();
      stopSpeakingClip();
      // Finalize with interrupted marker — text already reflects only what was heard
      finalizeAssistantMessage(true);
      break;

    case "transcript":
      addMessage("user", msg.text);
      break;

    case "response_text": {
      if (!msg.final) {
        // Queue text — it will be displayed when corresponding audio starts playing
        pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
        break;
      }
      // All chunks sent. Finalize now only if nothing is still outstanding:
      //  - queued or scheduled text is finalized by playAudioChunk once the
      //    last audio source ends;
      //  - playing, queued or announced speaking clips are finalized by the
      //    clip's onended handler. Finalizing early would make the text of
      //    the remaining clips start a new message bubble.
      const textOutstanding =
        pendingTextChunks.length > 0 || scheduledTextTimers.length > 0;
      const clipOutstanding =
        currentSpeakingClipBlobUrl !== null ||
        speakingClipQueue.length > 0 ||
        pendingSpeakingClipMeta !== null;
      if (!textOutstanding && !clipOutstanding) {
        finalizeAssistantMessage(false);
      }
      break;
    }

    case "video_mode":
      // Sent once on WS open. Toggles the video element + speaking-clip path.
      applyVideoModeState(msg);
      break;

    case "speaking_clip":
      // Envelope preceding an MP4 binary frame; remembered until that frame
      // arrives in the socket's onmessage handler.
      pendingSpeakingClipMeta = {
        chunk_id: msg.chunk_id,
        duration_ms: msg.duration_ms,
        text: msg.text,
      };
      break;

    default:
      // Unknown message types are ignored.
      break;
  }
}
|
|
|
|
// --- Video mode ------------------------------------------------------------
|
|
|
|
/**
 * Copy the server's "video_mode" message into the module-level video state
 * and refresh the stage.
 *
 * @param {object} msg - Reads `enabled`, `ready`, `mode` and `idle_clip_url`.
 */
function applyVideoModeState(msg) {
  // Video mode only counts as on when the engine is both enabled and ready.
  videoModeEnabled = Boolean(msg.enabled) && Boolean(msg.ready);
  videoModeName = msg.mode ? msg.mode : "off";
  idleClipUrl = msg.idle_clip_url ? msg.idle_clip_url : null;
  refreshStage();
}
|
|
|
|
/**
 * Show or hide the avatar stage according to the current video state, and
 * start the idle clip if the video element is not already showing it.
 *
 * The stage is active only when video mode is enabled and an idle clip URL
 * is known.
 */
function refreshStage() {
  if (!videoModeEnabled || !idleClipUrl) {
    stageEl.classList.remove("active");
    return;
  }

  stageEl.classList.add("active");

  // The video element's `src` reads back as an absolute, normalised URL.
  // Resolve idleClipUrl the same way so absolute or percent-encodable URLs
  // compare equal instead of restarting the idle clip on every refresh.
  let resolvedIdleUrl;
  try {
    resolvedIdleUrl = new URL(idleClipUrl, location.href).href;
  } catch (_) {
    // Unparseable value: fall back to plain origin concatenation.
    resolvedIdleUrl = location.origin + idleClipUrl;
  }

  if (avatarVideo.src !== resolvedIdleUrl) {
    _returnToIdle();
  }
}
|
|
|
|
/**
 * Point the avatar video back at the idle clip and start it, muted.
 * Does nothing when no idle clip URL is known.
 */
function _returnToIdle() {
  if (idleClipUrl) {
    // Drop any speaking-clip onended handler before swapping the source.
    // NOTE(review): loop is set to false, so the idle clip plays once and
    // then stops — confirm this is intended rather than a looping idle.
    Object.assign(avatarVideo, {
      onended: null,
      loop: false,
      muted: true,
      src: idleClipUrl,
    });
    // Best effort: a rejected play() is deliberately ignored here.
    avatarVideo.play().catch(() => {});
  }
}
|
|
|
|
/**
 * Wrap a received MP4 frame in an object URL and either play it right away
 * or queue it behind the clip that is currently loaded.
 *
 * @param {ArrayBuffer} arrayBuffer - MP4 bytes from the WebSocket.
 * @param {object} meta - The envelope stored for this frame.
 */
function playSpeakingClip(arrayBuffer, meta) {
  const blobUrl = URL.createObjectURL(
    new Blob([arrayBuffer], { type: "video/mp4" })
  );

  const clipInProgress = currentSpeakingClipBlobUrl !== null;
  if (!clipInProgress) {
    _startSpeakingClip(blobUrl, meta);
    return;
  }

  // A clip is already playing — this one waits its turn.
  speakingClipQueue.push({ blobUrl, meta });
}
|
|
|
|
/**
 * Load a speaking clip into the avatar video and play it with sound.
 *
 * When the clip finishes — or cannot be played at all — the next queued
 * clip is started; once the queue is empty the assistant message is
 * finalized and the video returns to the idle clip.
 *
 * @param {string} blobUrl - Object URL of the MP4 to play.
 * @param {object} meta - Clip envelope; `meta.text`, when present, is
 *   appended to the assistant message as the clip starts.
 */
function _startSpeakingClip(blobUrl, meta) {
  // Each clip gets its own generation number so callbacks belonging to a
  // replaced or stopped clip can recognise that they are stale.
  const gen = ++currentClipGeneration;

  if (currentSpeakingClipBlobUrl) {
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  }
  currentSpeakingClipBlobUrl = blobUrl;

  avatarVideo.loop = false;
  avatarVideo.muted = false;
  avatarVideo.src = blobUrl;

  if (meta && meta.text) {
    appendAssistantText(meta.text);
  }
  isPlaying = true;

  // Shared completion path for "clip ended" and "clip could not be played".
  const finishClip = () => {
    if (currentClipGeneration !== gen) return; // stale handler from a replaced clip
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
    currentSpeakingClipBlobUrl = null;

    const next = speakingClipQueue.shift();
    if (next) {
      _startSpeakingClip(next.blobUrl, next.meta);
    } else {
      isPlaying = false;
      finalizeAssistantMessage(false);
      _returnToIdle();
    }
  };

  avatarVideo.onended = finishClip;

  avatarVideo.play().catch((e) => {
    console.error("speaking clip play failed:", e);
    // A rejected play() never fires `ended`. Without advancing here the
    // clip would stay "current" forever: isPlaying would remain true and
    // every later clip would be queued and never played. Rejections that
    // belong to an already stopped or replaced clip are filtered out by
    // the generation check inside finishClip.
    finishClip();
  });
}
|
|
|
|
/**
 * Abort speaking-clip playback: drop every queued clip, invalidate pending
 * onended handlers and, if a clip is loaded, pause it and go back to idle.
 */
function stopSpeakingClip() {
  // Release the object URLs of clips that will now never play.
  speakingClipQueue.forEach((queued) => URL.revokeObjectURL(queued.blobUrl));
  speakingClipQueue = [];
  currentClipGeneration += 1; // invalidate any in-flight onended handlers

  if (currentSpeakingClipBlobUrl) {
    try {
      avatarVideo.pause();
    } catch (_) {}
    avatarVideo.onended = null;
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
    currentSpeakingClipBlobUrl = null;
    isPlaying = false;
    _returnToIdle();
  }
}
|
|
|
|
/**
 * Upload the image chosen in `#avatar-file` to `/api/set-avatar` and, on
 * success, switch the stage to the idle clip the server returns.
 *
 * Progress and errors are reported through `#avatar-status`; the function
 * itself never rejects.
 */
async function uploadAvatar() {
  const fileInput = document.getElementById("avatar-file");
  const status = document.getElementById("avatar-status");

  const image = fileInput.files && fileInput.files[0];
  if (!image) {
    status.textContent = "Pick an image first.";
    return;
  }

  status.textContent = "Uploading and rendering idle clip (this takes a while)...";
  const fd = new FormData();
  fd.append("image", image);

  try {
    const resp = await fetch("/api/set-avatar", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();

    // Without this check a missing field would become the literal URL
    // "undefined?t=..." and video mode would be enabled with a broken clip.
    if (!data || typeof data.idle_clip_url !== "string" || data.idle_clip_url === "") {
      throw new Error("Server response did not include an idle clip URL.");
    }

    // Cache-bust, extending an existing query string instead of adding a
    // second "?".
    const separator = data.idle_clip_url.includes("?") ? "&" : "?";
    idleClipUrl = data.idle_clip_url + separator + "t=" + Date.now();
    videoModeEnabled = true;
    videoModeName = data.mode || videoModeName;
    refreshStage();
    // Report the mode actually in effect; `data.mode` may be absent.
    status.textContent = "Avatar ready (" + videoModeName + ")";
  } catch (err) {
    console.error(err);
    status.textContent = "Failed: " + err.message;
  }
}
|
|
|
|
/**
 * Send the mode selected in `#video-mode-select` to `/api/set-video-mode`
 * and mirror the server's answer in local state and `#avatar-status`.
 *
 * When the server reports mode "off", video mode is disabled and the stage
 * is hidden. Failures are shown in the status element.
 */
async function applyVideoMode() {
  const modeSelect = document.getElementById("video-mode-select");
  const statusEl = document.getElementById("avatar-status");

  const payload = new FormData();
  payload.append("mode", modeSelect.value);

  try {
    const response = await fetch("/api/set-video-mode", { method: "POST", body: payload });
    if (!response.ok) {
      throw new Error(await response.text());
    }

    const reply = await response.json();
    videoModeName = reply.mode;

    if (reply.mode === "off") {
      videoModeEnabled = false;
      stageEl.classList.remove("active");
    }

    // Optional server note is appended after an em dash.
    const suffix = reply.note ? " — " + reply.note : "";
    statusEl.textContent = "Mode: " + reply.mode + suffix;
  } catch (err) {
    statusEl.textContent = "Failed: " + err.message;
  }
}
|
|
|
|
// --- Status ---
|
|
|
|
/**
 * Update the status badge: a human-readable label as its text and the raw
 * state as its CSS class.
 *
 * @param {string} state - Known values get a friendly label; any other
 *   value is shown verbatim.
 */
function setStatus(state) {
  const labels = new Map([
    ["listening", "Listening"],
    ["thinking", "Thinking..."],
    ["speaking", "Speaking"],
    ["disconnected", "Disconnected"],
  ]);

  statusBadge.textContent = labels.has(state) ? labels.get(state) : state;
  statusBadge.className = state;
}
|
|
|
|
// --- Chat Messages ---
|
|
|
|
// Message element currently being filled with assistant text; null when no
// assistant message is open.
let currentAssistantEl = null;
// Text accumulated so far for that element.
let currentAssistantText = "";
|
|
|
|
/**
 * Append a complete chat message to the chat area and scroll to the bottom.
 *
 * @param {string} role - Used as the second CSS class ("message <role>").
 * @param {string} text - Message body, inserted as plain text.
 */
function addMessage(role, text) {
  const bubble = Object.assign(document.createElement("div"), {
    className: `message ${role}`,
    textContent: text,
  });
  chatArea.appendChild(bubble);
  chatArea.scrollTop = chatArea.scrollHeight;
}
|
|
|
|
/**
 * Add a piece of text to the open assistant message, creating the message
 * element first if none is open. Pieces are joined with a single space.
 *
 * @param {string} text - Text to append.
 */
function appendAssistantText(text) {
  if (!currentAssistantEl) {
    // First piece of a new assistant turn: open a fresh bubble.
    const bubble = document.createElement("div");
    bubble.className = "message assistant";
    chatArea.appendChild(bubble);
    currentAssistantEl = bubble;
    currentAssistantText = "";
  }

  const separator = currentAssistantText ? " " : "";
  currentAssistantText = currentAssistantText + (separator + text);
  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}
|
|
|
|
/**
 * Close the open assistant message and reset all text/audio sync state.
 *
 * @param {boolean} [interrupted=false] - When true and the open message has
 *   text, an " [interrupted]" marker span is appended to it first.
 */
function finalizeAssistantMessage(interrupted = false) {
  const shouldMark = interrupted && currentAssistantEl && currentAssistantText;
  if (shouldMark) {
    const marker = Object.assign(document.createElement("span"), {
      className: "interrupted-marker",
      textContent: " [interrupted]",
    });
    currentAssistantEl.appendChild(marker);
  }

  // The next assistant text starts a new bubble.
  currentAssistantEl = null;
  currentAssistantText = "";

  // Reset sync state: drop queued text and cancel scheduled displays.
  pendingTextChunks = [];
  scheduledTextTimers.forEach((timerId) => clearTimeout(timerId));
  scheduledTextTimers = [];
  lastDisplayedChunkId = -1;
}
|
|
|
|
// --- Audio Playback ---
|
|
|
|
// Buffer source nodes that have been scheduled and have not ended yet.
let activeSources = [];
|
|
|
|
/**
 * Return the shared playback AudioContext, creating one at PLAYBACK_SR when
 * none exists yet or the existing one has been closed.
 *
 * @returns {AudioContext} The context stored in `audioCtx`.
 */
function getPlaybackCtx() {
  const usable = audioCtx && audioCtx.state !== "closed";
  if (!usable) {
    audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  }
  return audioCtx;
}
|
|
|
|
/**
 * Schedule one chunk of 16-bit PCM audio for gapless playback and pair it
 * with the next queued text chunk, so the text appears when the audio
 * starts.
 *
 * Chunks play back-to-back: each one starts at `nextPlayTime`, which is
 * then advanced by the chunk's duration. When the last active source ends
 * and no text is outstanding, the assistant message is finalized.
 *
 * @param {ArrayBuffer} arrayBuffer - Mono signed 16-bit samples, treated as
 *   PLAYBACK_SR Hz. A trailing odd byte is ignored; a chunk with no
 *   complete sample is skipped.
 */
function playAudioChunk(arrayBuffer) {
  const ctx = getPlaybackCtx();

  // Two bytes per sample. Constructing an Int16Array over an odd-length
  // buffer throws, and a zero-length AudioBuffer cannot be created, so
  // handle both explicitly.
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) return;
  const int16 = new Int16Array(arrayBuffer, 0, sampleCount);

  // Convert to float samples directly inside the AudioBuffer.
  const buffer = ctx.createBuffer(1, sampleCount, PLAYBACK_SR);
  const channel = buffer.getChannelData(0);
  for (let i = 0; i < sampleCount; i++) {
    channel[i] = int16[i] / 32768;
  }

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);

  activeSources.push(source);
  isPlaying = true;

  // Pair this audio chunk with the next queued text chunk
  const textEntry = pendingTextChunks.shift();

  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    // Nothing scheduled ahead (or we fell behind): start almost immediately.
    nextPlayTime = now + 0.01;
  }

  // Schedule text display to coincide with audio playback start
  if (textEntry) {
    const delayMs = Math.max(0, (nextPlayTime - now) * 1000);
    const tid = setTimeout(() => {
      appendAssistantText(textEntry.text);
      lastDisplayedChunkId = textEntry.chunkId;
      scheduledTextTimers = scheduledTextTimers.filter((t) => t !== tid);
    }, delayMs);
    scheduledTextTimers.push(tid);
  }

  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length === 0) {
      isPlaying = false;
      bargeInCount = 0;
      // If all audio has finished and no more text pending, finalize
      if (pendingTextChunks.length === 0 && scheduledTextTimers.length === 0) {
        finalizeAssistantMessage(false);
      }
    }
  };

  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}
|
|
|
|
/**
 * Stop all PCM playback immediately and clear the text/audio sync queues.
 * Used when the assistant is interrupted.
 */
function stopPlayback() {
  // stop() may throw on a source that cannot be stopped; that is fine here.
  activeSources.forEach((node) => {
    try {
      node.stop();
    } catch (_) {}
  });
  activeSources = [];

  nextPlayTime = 0;
  isPlaying = false;
  bargeInCount = 0;

  // Cancel any pending text displays
  scheduledTextTimers.forEach((timerId) => clearTimeout(timerId));
  scheduledTextTimers = [];
  pendingTextChunks = [];
}
|
|
|
|
// --- Microphone ---
|
|
|
|
/**
 * Flip the microphone: stop capture when it is active, otherwise start it
 * and wait for the start to complete.
 */
async function toggleMic() {
  if (!micActive) {
    await startMic();
    return;
  }
  stopMic();
}
|
|
|
|
/**
 * RMS energy of one microphone frame, with samples scaled by 1/32768.
 *
 * @param {ArrayBuffer} pcmFrame - Frame posted by the capture worklet;
 *   presumably signed 16-bit PCM — verify against /static/processor.js.
 * @returns {number} RMS value, or 0 for a frame without samples.
 */
function _micFrameRms(pcmFrame) {
  const samples = new Int16Array(pcmFrame);
  if (samples.length === 0) return 0;
  let sum = 0;
  for (let i = 0; i < samples.length; i++) {
    const s = samples[i] / 32768;
    sum += s * s;
  }
  return Math.sqrt(sum / samples.length);
}

/**
 * Handle one microphone frame: forward it to the server and, while the
 * assistant is playing, run client-side barge-in detection on it.
 *
 * @param {ArrayBuffer} pcmFrame - Frame posted by the capture worklet.
 */
function _onMicFrame(pcmFrame) {
  if (!ws || ws.readyState !== WebSocket.OPEN) return;
  ws.send(pcmFrame);

  // Client-side barge-in: detect mic energy while playing
  if (!isPlaying) return;

  if (!(_micFrameRms(pcmFrame) > BARGE_IN_THRESHOLD)) {
    bargeInCount = 0;
    return;
  }

  bargeInCount++;
  if (bargeInCount < BARGE_IN_FRAMES) return;

  // User is speaking over the assistant - interrupt
  stopPlayback();
  stopSpeakingClip();
  const msg = { type: "interrupt" };
  if (lastDisplayedChunkId >= 0) {
    // Tell the server how much of the reply was actually shown.
    msg.last_chunk_id = lastDisplayedChunkId;
  }
  ws.send(JSON.stringify(msg));
  finalizeAssistantMessage(true);
  isPlaying = false;
  bargeInCount = 0;
}

/**
 * Start microphone capture: resume the playback context, open the mic,
 * build a MIC_SR capture graph feeding the "pcm-processor" worklet, and
 * make sure the WebSocket is connected.
 *
 * On failure the user is alerted and anything acquired so far (mic tracks,
 * capture context) is released again. The function never rejects.
 */
async function startMic() {
  let stream = null;
  let micCtx = null;

  try {
    // Ensure playback context exists (needed for user gesture)
    getPlaybackCtx();
    if (audioCtx.state === "suspended") {
      await audioCtx.resume();
    }

    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Create a separate context at 16kHz for mic capture
    micCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = micCtx.createMediaStreamSource(stream);

    await micCtx.audioWorklet.addModule("/static/processor.js");
    const node = new AudioWorkletNode(micCtx, "pcm-processor");
    source.connect(node);
    node.port.onmessage = (e) => _onMicFrame(e.data);

    // Store for cleanup
    node._micCtx = micCtx;

    // Publish only once the whole graph is built, so a failure above never
    // leaves half-initialised handles in module state.
    micStream = stream;
    workletNode = node;
    micActive = true;
    micBtn.classList.add("active");

    // Connect WebSocket if not already connected or connecting. Opening a
    // second socket while the first is still CONNECTING would leave two
    // live connections.
    const socketUsable =
      ws &&
      (ws.readyState === WebSocket.OPEN || ws.readyState === WebSocket.CONNECTING);
    if (!socketUsable) {
      connectWS();
    }
  } catch (err) {
    console.error("Mic access failed:", err);
    if (!micActive) {
      // Setup failed part-way: release what was acquired, otherwise the
      // microphone stays open with no way to stop it from the UI.
      if (stream) {
        stream.getTracks().forEach((t) => t.stop());
      }
      if (micCtx) {
        // Already on the error path; a failing close() adds nothing useful.
        micCtx.close().catch(() => {});
      }
    }
    alert("Could not access microphone. Please allow mic permissions.");
  }
}
|
|
|
|
/**
 * Stop microphone capture: disconnect the worklet node, close its capture
 * context, stop every track of the mic stream and reset the mic button.
 */
function stopMic() {
  const node = workletNode;
  if (node) {
    node.disconnect();
    // The capture context was stashed on the node when the mic was started.
    const captureCtx = node._micCtx;
    if (captureCtx) {
      captureCtx.close();
    }
    workletNode = null;
  }

  const stream = micStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }

  micActive = false;
  micBtn.classList.remove("active");
}
|
|
|
|
// --- Voice Selection ---
|
|
|
|
/**
 * Send the voice selected in `#voice-select` to `/api/set-voice` and report
 * the outcome in `#voice-status`.
 *
 * The request also carries a fixed `lang` field of "a"; its meaning is
 * defined by the server and is not visible here.
 */
async function applyVoice() {
  const voice = document.getElementById("voice-select").value;
  const statusEl = document.getElementById("voice-status");

  const formData = new FormData();
  formData.append("voice", voice);
  formData.append("lang", "a");

  statusEl.textContent = "Applying...";
  try {
    const resp = await fetch("/api/set-voice", {
      method: "POST",
      body: formData,
    });
    // Same convention as the other API helpers in this file: surface the
    // server's error text instead of a JSON parse error on a non-JSON body.
    if (!resp.ok) throw new Error(await resp.text());

    const data = await resp.json();
    if (data.status === "ok") {
      statusEl.textContent = "Voice: " + voice;
    } else {
      statusEl.textContent = "Failed.";
    }
  } catch (err) {
    statusEl.textContent = "Error: " + err.message;
  }
}
|
|
|
|
// Expose the UI entry points on `window` so inline onclick attributes in
// the HTML can reach them.
Object.assign(window, {
  toggleMic,
  applyVoice,
  uploadAvatar,
  applyVideoMode,
});
|