first stab at adding video

This commit is contained in:
2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
+160 -1
View File
@@ -18,9 +18,18 @@ let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user
// --- Video mode state ---
// videoModeEnabled is set from the "video_mode" message (enabled AND ready),
// forced to true after a successful avatar upload, and cleared when the mode
// is switched to "off".
let videoModeEnabled = false; // true when server has video engine active AND ready
let videoModeName = "off"; // "off" | "library" | "reflective"
let idleClipUrl = null; // URL string (server-served) or null
// Filled in by the "speaking_clip" JSON envelope and consumed (reset to null)
// by the next binary WebSocket frame.
let pendingSpeakingClipMeta = null; // {chunk_id, duration_ms, text} waiting for MP4 binary
// Object URL of the speaking clip currently loaded into avatarVideo, or null
// when none is loaded. Revoked when the clip ends or is stopped.
let currentSpeakingClipBlobUrl = null;
// --- DOM handles (looked up once when the script loads) ---
const chatArea = document.getElementById("chat-area");
const statusBadge = document.getElementById("status-badge");
const micBtn = document.getElementById("mic-btn");
// Media element shared by the idle loop and the speaking clips
// (presumably a <video> — confirm against the page markup).
const avatarVideo = document.getElementById("avatar-video");
// Container that receives the "active" class while video mode is showing.
const stageEl = document.getElementById("stage");
// --- WebSocket ---
@@ -44,7 +53,18 @@ function connectWS() {
ws.onmessage = (event) => {
if (event.data instanceof ArrayBuffer) {
playAudioChunk(event.data);
// In video mode, the next binary frame after a "speaking_clip"
// envelope is an MP4 blob; otherwise it's a PCM audio chunk.
if (pendingSpeakingClipMeta) {
const meta = pendingSpeakingClipMeta;
pendingSpeakingClipMeta = null;
playSpeakingClip(event.data, meta);
} else if (videoModeEnabled) {
// Video mode is active but we didn't get a speaking_clip envelope
// first — ignore raw PCM so we don't double-play audio.
} else {
playAudioChunk(event.data);
}
} else {
handleJSON(JSON.parse(event.data));
}
@@ -59,6 +79,7 @@ function handleJSON(msg) {
case "interrupt":
stopPlayback();
stopSpeakingClip();
// Finalize with interrupted marker — text already reflects only what was heard
finalizeAssistantMessage(true);
break;
@@ -80,6 +101,141 @@ function handleJSON(msg) {
pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
}
break;
case "video_mode":
// Sent once on WS open. Toggles the video element + speaking-clip path.
applyVideoModeState(msg);
break;
case "speaking_clip":
// Envelope preceding an MP4 binary frame with the full turn.
pendingSpeakingClipMeta = {
chunk_id: msg.chunk_id,
duration_ms: msg.duration_ms,
text: msg.text,
};
break;
}
}
// --- Video mode ------------------------------------------------------------
/**
 * Copy the fields of a "video_mode" message into the module-level video state
 * and re-render the stage.
 *
 * @param {{enabled?: *, ready?: *, mode?: string, idle_clip_url?: string}} msg
 *   Parsed message. Video mode is turned on only when both `enabled` and
 *   `ready` are truthy; a falsy `mode` falls back to "off" and a falsy
 *   `idle_clip_url` falls back to null.
 */
function applyVideoModeState(msg) {
  // Both flags must be truthy for the video path to be used.
  videoModeEnabled = Boolean(msg.enabled) && Boolean(msg.ready);

  const reportedMode = msg.mode;
  videoModeName = reportedMode ? reportedMode : "off";

  const reportedIdleUrl = msg.idle_clip_url;
  idleClipUrl = reportedIdleUrl ? reportedIdleUrl : null;

  refreshStage();
}
/**
 * Sync the avatar stage with the current video-mode state.
 *
 * Reads the module-level `videoModeEnabled`, `idleClipUrl` and
 * `currentSpeakingClipBlobUrl`. When video mode is on and an idle clip is
 * known, the stage is marked "active" and the idle clip is (re)started as a
 * muted loop if it is not already the loaded source. Otherwise the stage is
 * deactivated.
 */
function refreshStage() {
  if (!videoModeEnabled || !idleClipUrl) {
    stageEl.classList.remove("active");
    return;
  }
  stageEl.classList.add("active");

  // A speaking clip currently owns the media element. Swapping the source now
  // would cut it off without its end-of-clip cleanup ever running, so leave it
  // alone; the idle loop is restored from idleClipUrl when that clip ends or
  // is stopped.
  if (currentSpeakingClipBlobUrl) return;

  // The element's `src` reads back as an absolute URL, so resolve the idle URL
  // the same way before comparing. Plain `origin + path` concatenation only
  // matches root-relative URLs and would restart the loop on every call for
  // absolute or document-relative ones.
  const resolvedIdleUrl = new URL(idleClipUrl, document.baseURI).href;
  if (avatarVideo.src === resolvedIdleUrl) return;

  avatarVideo.src = idleClipUrl;
  avatarVideo.loop = true;
  avatarVideo.muted = true;
  // Best-effort: a rejected play() here (e.g. the source is swapped again
  // before playback starts) is expected and not actionable.
  avatarVideo.play().catch(() => {});
}
/**
 * Play a speaking clip (MP4 bytes) in place of the idle loop and show the
 * reply text that goes with it.
 *
 * When the clip finishes, errors, or cannot be started at all, the assistant
 * message is finalized, the idle loop (if any) is restored, and the clip's
 * object URL is released.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw bytes of the MP4 clip.
 * @param {{chunk_id?: *, duration_ms?: *, text?: string}|null} meta - Envelope
 *   that preceded the binary frame; only `text` is used here.
 */
function playSpeakingClip(arrayBuffer, meta) {
  // Replace the idle loop with the speaking clip.
  stopSpeakingClip();

  const clipBlob = new Blob([arrayBuffer], { type: "video/mp4" });
  const clipUrl = URL.createObjectURL(clipBlob);
  currentSpeakingClipBlobUrl = clipUrl;
  avatarVideo.loop = false;
  avatarVideo.muted = false;
  avatarVideo.src = clipUrl;

  // Show the full reply text now — the MP4 plays it in one shot so there's
  // no per-chunk sync to do.
  if (meta && meta.text) {
    appendAssistantText(meta.text);
  }
  isPlaying = true;

  // Shared teardown for "ended", "error" and a rejected play().
  const finishClip = () => {
    // Ignore late or duplicate callbacks: if this clip is no longer the
    // current one (interrupted, replaced, or already finished) its cleanup
    // has been done elsewhere.
    if (currentSpeakingClipBlobUrl !== clipUrl) return;
    currentSpeakingClipBlobUrl = null;
    avatarVideo.onended = null;
    avatarVideo.onerror = null;

    isPlaying = false;
    finalizeAssistantMessage(false);
    // Return to idle loop.
    if (idleClipUrl) {
      avatarVideo.loop = true;
      avatarVideo.muted = true;
      avatarVideo.src = idleClipUrl;
      avatarVideo.play().catch(() => {});
    }
    URL.revokeObjectURL(clipUrl);
  };

  avatarVideo.onended = finishClip;
  avatarVideo.onerror = finishClip;
  avatarVideo.play().catch((e) => {
    console.error("speaking clip play failed:", e);
    // Without this, a blocked or failed start would leave isPlaying stuck at
    // true and the stage frozen on a clip that never runs.
    finishClip();
  });
}
/**
 * Abort the speaking clip that is currently loaded, if any, and fall back to
 * the idle loop.
 *
 * Does nothing when no speaking clip is loaded. Otherwise pauses the media
 * element, releases the clip's object URL, restores the muted idle loop when
 * an idle clip URL is known, and clears `isPlaying`.
 */
function stopSpeakingClip() {
  if (currentSpeakingClipBlobUrl) {
    try {
      avatarVideo.pause();
    } catch (pauseError) {
      // Pausing is best-effort; cleanup below must run regardless.
    }

    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
    currentSpeakingClipBlobUrl = null;

    const hasIdleClip = Boolean(idleClipUrl);
    if (hasIdleClip) {
      avatarVideo.loop = true;
      avatarVideo.muted = true;
      avatarVideo.src = idleClipUrl;
      avatarVideo.play().catch(() => {});
    }

    isPlaying = false;
  }
}
/**
 * Upload the image chosen in the "avatar-file" input to /api/set-avatar and,
 * on success, switch the stage to the idle clip the server returns.
 *
 * Progress and failures are reported through the "avatar-status" element;
 * errors are caught here and never propagate to the caller.
 *
 * @returns {Promise<void>}
 */
async function uploadAvatar() {
  const fileInput = document.getElementById("avatar-file");
  const status = document.getElementById("avatar-status");
  const imageFile = fileInput.files && fileInput.files[0];
  if (!imageFile) {
    status.textContent = "Pick an image first.";
    return;
  }
  status.textContent = "Uploading and rendering idle clip (this takes a while)...";
  const fd = new FormData();
  fd.append("image", imageFile);
  try {
    const resp = await fetch("/api/set-avatar", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();
    // Without this check a missing field would become the literal URL
    // "undefined?t=..." and video mode would be enabled with a broken clip.
    if (!data.idle_clip_url) {
      throw new Error("server response did not include idle_clip_url");
    }
    // Cache-bust so a re-rendered clip at the same path is refetched. Use "&"
    // when the server URL already carries a query string; the URL is kept in
    // the same (possibly relative) form the server sent.
    const separator = data.idle_clip_url.includes("?") ? "&" : "?";
    idleClipUrl = `${data.idle_clip_url}${separator}t=${Date.now()}`;
    videoModeEnabled = true;
    videoModeName = data.mode || videoModeName;
    refreshStage();
    // Report the mode actually in effect, not a possibly-absent response field.
    status.textContent = "Avatar ready (" + videoModeName + ")";
  } catch (err) {
    console.error(err);
    status.textContent = "Failed: " + err.message;
  }
}
/**
 * Send the mode chosen in the "video-mode-select" control to
 * /api/set-video-mode and mirror the server's answer locally.
 *
 * The returned mode is stored in `videoModeName`; when it is "off", video mode
 * is disabled and the stage is deactivated. The outcome (or the failure
 * message) is written to the "avatar-status" element.
 *
 * @returns {Promise<void>}
 */
async function applyVideoMode() {
  const modeSelect = document.getElementById("video-mode-select");
  const statusEl = document.getElementById("avatar-status");

  const payload = new FormData();
  payload.append("mode", modeSelect.value);

  try {
    const response = await fetch("/api/set-video-mode", { method: "POST", body: payload });
    if (!response.ok) {
      // Surface the server's response body as the error message.
      throw new Error(await response.text());
    }

    const result = await response.json();
    videoModeName = result.mode;

    const turnedOff = result.mode === "off";
    if (turnedOff) {
      videoModeEnabled = false;
      stageEl.classList.remove("active");
    }

    const noteSuffix = result.note ? " — " + result.note : "";
    statusEl.textContent = "Mode: " + result.mode + noteSuffix;
  } catch (failure) {
    statusEl.textContent = "Failed: " + failure.message;
  }
}
@@ -275,6 +431,7 @@ async function startMic() {
if (bargeInCount >= BARGE_IN_FRAMES) {
// User is speaking over the assistant - interrupt
stopPlayback();
stopSpeakingClip();
const msg = { type: "interrupt" };
if (lastDisplayedChunkId >= 0) {
msg.last_chunk_id = lastDisplayedChunkId;
@@ -353,3 +510,5 @@ async function applyVoice() {
// Publish the handlers on `window` so inline HTML onclick attributes can
// reach them.
Object.assign(window, {
  toggleMic,
  applyVoice,
  uploadAvatar,
  applyVideoMode,
});