first stab at adding video

2026-04-12 04:11:52 -04:00
parent 680c5b04cc
commit 2818b41004
37 changed files with 2982 additions and 24 deletions
@@ -18,9 +18,18 @@ let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to
 let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
 let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user

+// --- Video mode state ---
+let videoModeEnabled = false;   // true when server has video engine active AND ready
+let videoModeName = "off";      // "off" | "library" | "reflective"
+let idleClipUrl = null;         // URL string (server-served) or null
+let pendingSpeakingClipMeta = null; // {chunk_id, duration_ms, text} waiting for MP4 binary
+let currentSpeakingClipBlobUrl = null; // object URL of the speaking clip loaded into the avatar video, or null when none is active
+
 const chatArea = document.getElementById("chat-area");
 const statusBadge = document.getElementById("status-badge");
 const micBtn = document.getElementById("mic-btn");
+const avatarVideo = document.getElementById("avatar-video"); // element that plays both the idle loop and the speaking clips
+const stageEl = document.getElementById("stage"); // carries the "active" class while video mode is on and an idle clip is known

 // --- WebSocket ---

@@ -44,7 +53,18 @@ function connectWS() {

  ws.onmessage = (event) => {
    if (event.data instanceof ArrayBuffer) {
-      playAudioChunk(event.data);
+      // In video mode, the next binary frame after a "speaking_clip"
+      // envelope is an MP4 blob; otherwise it's a PCM audio chunk.
+      if (pendingSpeakingClipMeta) {
+        const meta = pendingSpeakingClipMeta;
+        pendingSpeakingClipMeta = null;
+        playSpeakingClip(event.data, meta);
+      } else if (videoModeEnabled) {
+        // Video mode is active but we didn't get a speaking_clip envelope
+        // first — ignore raw PCM so we don't double-play audio.
+      } else {
+        playAudioChunk(event.data);
+      }
    } else {
      handleJSON(JSON.parse(event.data));
    }
@@ -59,6 +79,7 @@ function handleJSON(msg) {

    case "interrupt":
      stopPlayback();
+      stopSpeakingClip();
      // Finalize with interrupted marker — text already reflects only what was heard
      finalizeAssistantMessage(true);
      break;
@@ -80,6 +101,141 @@ function handleJSON(msg) {
        pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
      }
      break;
+
+    case "video_mode":
+      // Sent once on WS open. Toggles the video element + speaking-clip path.
+      applyVideoModeState(msg);
+      break;
+
+    case "speaking_clip":
+      // Envelope preceding an MP4 binary frame with the full turn.
+      pendingSpeakingClipMeta = {
+        chunk_id: msg.chunk_id,
+        duration_ms: msg.duration_ms,
+        text: msg.text,
+      };
+      break;
+  }
+}
+
+// --- Video mode ------------------------------------------------------------
+
+/**
+ * Sync the local video-mode state with a "video_mode" message and redraw
+ * the stage.
+ *
+ * @param {object} msg - Parsed message; reads `enabled`, `ready`, `mode`
+ *   and `idle_clip_url`.
+ */
+function applyVideoModeState(msg) {
+  // Video counts as enabled only when both flags are truthy.
+  videoModeEnabled = Boolean(msg.enabled) && Boolean(msg.ready);
+  videoModeName = msg.mode ? msg.mode : "off";
+  idleClipUrl = msg.idle_clip_url ? msg.idle_clip_url : null;
+  refreshStage();
+}
+
+/**
+ * Show or hide the avatar stage to match the current video-mode state.
+ *
+ * When video mode is enabled and an idle clip URL is known, the stage gets
+ * the "active" class and the avatar video is pointed at the idle clip
+ * (looping, muted). Otherwise the "active" class is removed.
+ */
+function refreshStage() {
+  if (!videoModeEnabled || !idleClipUrl) {
+    stageEl.classList.remove("active");
+    return;
+  }
+  stageEl.classList.add("active");
+
+  // A speaking clip owns the video element while it plays. Swapping the
+  // source here would cut it off without resetting isPlaying or revoking
+  // its blob URL. The clip's onended handler and stopSpeakingClip() both
+  // switch back to whatever idleClipUrl is at that time.
+  if (currentSpeakingClipBlobUrl) return;
+
+  // avatarVideo.src reads back as an absolute URL, so resolve idleClipUrl
+  // the same way before comparing. Concatenating origin + path is wrong for
+  // absolute URLs and for paths without a leading slash, and made the clip
+  // reload on every call.
+  let idleSrc;
+  try {
+    idleSrc = new URL(idleClipUrl, document.baseURI).href;
+  } catch (_) {
+    idleSrc = idleClipUrl; // unparseable: fall back to a literal comparison
+  }
+  if (avatarVideo.src === idleSrc) return;
+
+  avatarVideo.src = idleClipUrl;
+  avatarVideo.loop = true;
+  avatarVideo.muted = true;
+  // Best effort: the browser may refuse autoplay.
+  avatarVideo.play().catch(() => {});
+}
+
+/**
+ * Play one speaking clip in place of the idle loop.
+ *
+ * @param {ArrayBuffer} arrayBuffer - Binary frame that followed the
+ *   "speaking_clip" envelope; wrapped in a Blob typed "video/mp4".
+ * @param {object|null} meta - Envelope fields; only `text` is used here.
+ */
+function playSpeakingClip(arrayBuffer, meta) {
+  // Tear down any clip that is still active before starting the next one.
+  stopSpeakingClip();
+  const clipUrl = URL.createObjectURL(
+    new Blob([arrayBuffer], { type: "video/mp4" })
+  );
+  currentSpeakingClipBlobUrl = clipUrl;
+
+  avatarVideo.loop = false;
+  avatarVideo.muted = false;
+  avatarVideo.src = clipUrl;
+
+  // Show the full reply text now — the MP4 plays it in one shot so there's
+  // no per-chunk sync to do.
+  if (meta && meta.text) {
+    appendAssistantText(meta.text);
+  }
+  isPlaying = true;
+
+  // Shared teardown for "clip finished" and "clip could not be played".
+  // The guard makes it a no-op once this clip has been stopped or replaced,
+  // so a stale handler can never touch a newer clip.
+  const finishClip = () => {
+    if (currentSpeakingClipBlobUrl !== clipUrl) return;
+    // Revokes the blob URL, returns to the idle loop and clears isPlaying.
+    stopSpeakingClip();
+    finalizeAssistantMessage(false);
+  };
+
+  avatarVideo.onended = finishClip;
+  avatarVideo.play().catch((e) => {
+    // A rejection caused by this clip being stopped or replaced while
+    // play() was pending is expected and needs no cleanup.
+    if (currentSpeakingClipBlobUrl !== clipUrl) return;
+    console.error("speaking clip play failed:", e);
+    // "ended" never fires for a clip that did not start; without this the
+    // UI would stay in the playing state and the blob URL would leak.
+    finishClip();
+  });
+}
+
+/**
+ * Abort the active speaking clip, if there is one, and fall back to the
+ * idle loop. Does nothing when no speaking-clip blob URL is held.
+ */
+function stopSpeakingClip() {
+  const activeClipUrl = currentSpeakingClipBlobUrl;
+  if (activeClipUrl) {
+    try {
+      avatarVideo.pause();
+    } catch (_) {
+      // Pausing is best-effort; carry on with the cleanup regardless.
+    }
+    URL.revokeObjectURL(activeClipUrl);
+    currentSpeakingClipBlobUrl = null;
+
+    const idleUrl = idleClipUrl;
+    if (idleUrl) {
+      // Resume the muted, looping idle clip.
+      avatarVideo.loop = true;
+      avatarVideo.muted = true;
+      avatarVideo.src = idleUrl;
+      avatarVideo.play().catch(() => {});
+    }
+    isPlaying = false;
+  }
+}
+
+/**
+ * Upload the image chosen in #avatar-file to /api/set-avatar and switch the
+ * stage to the idle clip the server returns.
+ *
+ * Progress and the outcome are written to #avatar-status. Request and
+ * response errors are caught, logged and reported in that status element.
+ *
+ * @returns {Promise<void>}
+ */
+async function uploadAvatar() {
+  const fileInput = document.getElementById("avatar-file");
+  const status = document.getElementById("avatar-status");
+  const image = fileInput.files && fileInput.files[0];
+  if (!image) {
+    status.textContent = "Pick an image first.";
+    return;
+  }
+  status.textContent = "Uploading and rendering idle clip (this takes a while)...";
+  const fd = new FormData();
+  fd.append("image", image);
+  try {
+    const resp = await fetch("/api/set-avatar", { method: "POST", body: fd });
+    if (!resp.ok) throw new Error(await resp.text());
+    const data = await resp.json();
+    // Without this check a missing field would turn into the literal URL
+    // "undefined?t=..." and video mode would be switched on with no clip.
+    if (typeof data.idle_clip_url !== "string" || data.idle_clip_url === "") {
+      throw new Error("server response has no idle_clip_url");
+    }
+    // Cache-bust so the freshly rendered clip replaces a cached one. Use "&"
+    // when the URL already carries a query string.
+    const separator = data.idle_clip_url.includes("?") ? "&" : "?";
+    idleClipUrl = data.idle_clip_url + separator + "t=" + Date.now();
+    videoModeEnabled = true;
+    videoModeName = data.mode || videoModeName;
+    refreshStage();
+    // Report the mode actually in effect; data.mode may be absent.
+    status.textContent = "Avatar ready (" + videoModeName + ")";
+  } catch (err) {
+    console.error(err);
+    status.textContent = "Failed: " + err.message;
+  }
+}
+
+/**
+ * Send the mode chosen in #video-mode-select to /api/set-video-mode and
+ * record the mode the server answers with.
+ *
+ * When the answer is "off", video mode is disabled and the stage is hidden.
+ * The outcome (or the failure message) is written to #avatar-status.
+ *
+ * @returns {Promise<void>}
+ */
+async function applyVideoMode() {
+  const modeSelect = document.getElementById("video-mode-select");
+  const statusEl = document.getElementById("avatar-status");
+  const payload = new FormData();
+  payload.append("mode", modeSelect.value);
+  try {
+    const response = await fetch("/api/set-video-mode", { method: "POST", body: payload });
+    if (!response.ok) {
+      const reason = await response.text();
+      throw new Error(reason);
+    }
+    const result = await response.json();
+    const mode = result.mode;
+    videoModeName = mode;
+    if (mode === "off") {
+      videoModeEnabled = false;
+      stageEl.classList.remove("active");
+    }
+    // Append the server's note only when it sent one.
+    let summary = "Mode: " + mode;
+    if (result.note) {
+      summary += " — " + result.note;
+    }
+    statusEl.textContent = summary;
+  } catch (failure) {
+    statusEl.textContent = "Failed: " + failure.message;
  }
 }

@@ -275,6 +431,7 @@ async function startMic() {
            if (bargeInCount >= BARGE_IN_FRAMES) {
              // User is speaking over the assistant - interrupt
              stopPlayback();
+              stopSpeakingClip();
              const msg = { type: "interrupt" };
              if (lastDisplayedChunkId >= 0) {
                msg.last_chunk_id = lastDisplayedChunkId;
@@ -353,3 +510,5 @@ async function applyVoice() {
 // Expose to HTML onclick
 window.toggleMic = toggleMic;
 window.applyVoice = applyVoice;
+window.uploadAvatar = uploadAvatar; // uploads the image picked in #avatar-file
+window.applyVideoMode = applyVideoMode; // submits the mode picked in #video-mode-select