first stab at adding video
This commit is contained in:
+160
-1
@@ -18,9 +18,18 @@ let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to
|
||||
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
|
||||
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user
|
||||
|
||||
// --- Video mode state ---
|
||||
let videoModeEnabled = false; // true when server has video engine active AND ready
|
||||
let videoModeName = "off"; // "off" | "library" | "reflective"
|
||||
let idleClipUrl = null; // URL string (server-served) or null
|
||||
let pendingSpeakingClipMeta = null; // {chunk_id, duration_ms, text} waiting for MP4 binary
|
||||
let currentSpeakingClipBlobUrl = null;
|
||||
|
||||
const chatArea = document.getElementById("chat-area");
|
||||
const statusBadge = document.getElementById("status-badge");
|
||||
const micBtn = document.getElementById("mic-btn");
|
||||
const avatarVideo = document.getElementById("avatar-video");
|
||||
const stageEl = document.getElementById("stage");
|
||||
|
||||
// --- WebSocket ---
|
||||
|
||||
@@ -44,7 +53,18 @@ function connectWS() {
|
||||
|
||||
ws.onmessage = (event) => {
|
||||
if (event.data instanceof ArrayBuffer) {
|
||||
playAudioChunk(event.data);
|
||||
// In video mode, the next binary frame after a "speaking_clip"
|
||||
// envelope is an MP4 blob; otherwise it's a PCM audio chunk.
|
||||
if (pendingSpeakingClipMeta) {
|
||||
const meta = pendingSpeakingClipMeta;
|
||||
pendingSpeakingClipMeta = null;
|
||||
playSpeakingClip(event.data, meta);
|
||||
} else if (videoModeEnabled) {
|
||||
// Video mode is active but we didn't get a speaking_clip envelope
|
||||
// first — ignore raw PCM so we don't double-play audio.
|
||||
} else {
|
||||
playAudioChunk(event.data);
|
||||
}
|
||||
} else {
|
||||
handleJSON(JSON.parse(event.data));
|
||||
}
|
||||
@@ -59,6 +79,7 @@ function handleJSON(msg) {
|
||||
|
||||
case "interrupt":
|
||||
stopPlayback();
|
||||
stopSpeakingClip();
|
||||
// Finalize with interrupted marker — text already reflects only what was heard
|
||||
finalizeAssistantMessage(true);
|
||||
break;
|
||||
@@ -80,6 +101,141 @@ function handleJSON(msg) {
|
||||
pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
|
||||
}
|
||||
break;
|
||||
|
||||
case "video_mode":
|
||||
// Sent once on WS open. Toggles the video element + speaking-clip path.
|
||||
applyVideoModeState(msg);
|
||||
break;
|
||||
|
||||
case "speaking_clip":
|
||||
// Envelope preceding an MP4 binary frame with the full turn.
|
||||
pendingSpeakingClipMeta = {
|
||||
chunk_id: msg.chunk_id,
|
||||
duration_ms: msg.duration_ms,
|
||||
text: msg.text,
|
||||
};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// --- Video mode ------------------------------------------------------------
|
||||
|
||||
/**
 * Copy the fields of a "video_mode" message into the client's video state
 * and then redraw the stage.
 *
 * Video mode is treated as enabled only when both `msg.enabled` and
 * `msg.ready` are truthy. A missing mode falls back to "off" and a missing
 * idle clip URL falls back to null.
 *
 * @param {object} msg - Parsed "video_mode" message; reads `enabled`,
 *   `ready`, `mode` and `idle_clip_url`.
 */
function applyVideoModeState(msg) {
  videoModeEnabled = Boolean(msg.enabled) && Boolean(msg.ready);

  const reportedMode = msg.mode;
  videoModeName = reportedMode ? reportedMode : "off";

  const reportedIdleUrl = msg.idle_clip_url;
  idleClipUrl = reportedIdleUrl ? reportedIdleUrl : null;

  // Let the stage pick up the new state.
  refreshStage();
}
|
||||
|
||||
/**
 * Show or hide the avatar stage to match the current video-mode state and,
 * when shown, make sure the idle clip is loaded into the avatar video.
 *
 * The stage is active only when video mode is enabled AND an idle clip URL
 * is known; otherwise the "active" class is removed.
 */
function refreshStage() {
  if (!videoModeEnabled || !idleClipUrl) {
    stageEl.classList.remove("active");
    return;
  }
  stageEl.classList.add("active");

  // A speaking clip currently owns the video element. Swapping the source
  // now would cut the clip off and its `ended` handler would never run.
  // That handler switches to whatever idleClipUrl is at the time, so the
  // new idle clip is picked up once the clip finishes.
  if (currentSpeakingClipBlobUrl) {
    return;
  }

  // `avatarVideo.src` reads back as an absolute URL, so resolve the idle URL
  // the same way before comparing. Plain `location.origin + idleClipUrl`
  // is wrong for absolute URLs and for paths without a leading slash.
  const resolvedIdleUrl = new URL(idleClipUrl, location.href).href;
  if (avatarVideo.src !== resolvedIdleUrl) {
    avatarVideo.src = idleClipUrl;
    avatarVideo.loop = true;
    avatarVideo.muted = true;
    // Autoplay can be refused by the browser; the idle loop is best-effort.
    avatarVideo.play().catch(() => {});
  }
}
|
||||
|
||||
/**
 * Play one speaking clip (an MP4 delivered as a binary WebSocket frame) in
 * the avatar video element, replacing the idle loop for its duration.
 *
 * When the clip ends, or when playback cannot be started at all, the
 * assistant message is finalized, the idle loop is restored (if an idle
 * clip is known) and the clip's object URL is revoked.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw MP4 bytes of the clip.
 * @param {?object} meta - Envelope that preceded the binary frame; only
 *   `meta.text` is read here.
 */
function playSpeakingClip(arrayBuffer, meta) {
  // Replace the idle loop (or an earlier clip) with the speaking clip.
  stopSpeakingClip();

  const clipBlob = new Blob([arrayBuffer], { type: "video/mp4" });
  const clipUrl = URL.createObjectURL(clipBlob);
  currentSpeakingClipBlobUrl = clipUrl;

  avatarVideo.loop = false;
  avatarVideo.muted = false;
  avatarVideo.src = clipUrl;

  // Show the full reply text now — the MP4 plays it in one shot so there's
  // no per-chunk sync to do.
  if (meta && meta.text) {
    appendAssistantText(meta.text);
  }
  isPlaying = true;

  // Shared teardown for "clip ended" and "clip could not be played".
  const finishClip = () => {
    avatarVideo.onended = null;
    isPlaying = false;
    finalizeAssistantMessage(false);
    // Return to idle loop.
    if (idleClipUrl) {
      avatarVideo.loop = true;
      avatarVideo.muted = true;
      avatarVideo.src = idleClipUrl;
      avatarVideo.play().catch(() => {});
    }
    if (currentSpeakingClipBlobUrl) {
      URL.revokeObjectURL(currentSpeakingClipBlobUrl);
      currentSpeakingClipBlobUrl = null;
    }
  };

  avatarVideo.onended = finishClip;

  avatarVideo.play().catch((e) => {
    console.error("speaking clip play failed:", e);
    // play() also rejects when the clip was interrupted or superseded before
    // it started; in that case whoever replaced it owns the cleanup.
    if (currentSpeakingClipBlobUrl !== clipUrl) {
      return;
    }
    // Otherwise `ended` will never fire, so release the clip here instead of
    // leaving isPlaying stuck, the blob URL leaked and the turn unfinalized.
    finishClip();
  });
}
|
||||
|
||||
/**
 * Abort the speaking clip that is currently loaded, if any, and fall back
 * to the idle loop.
 *
 * Does nothing when no speaking clip is active. Finalizing the assistant
 * message is left to the caller.
 */
function stopSpeakingClip() {
  if (!currentSpeakingClipBlobUrl) return;

  // Detach the end-of-clip handler first: an `ended` event that is already
  // queued must not finalize a turn that is being interrupted.
  avatarVideo.onended = null;

  try {
    avatarVideo.pause();
  } catch (_) {
    // Best effort — stopping must not fail because pause() did.
  }

  URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  currentSpeakingClipBlobUrl = null;

  if (idleClipUrl) {
    avatarVideo.loop = true;
    avatarVideo.muted = true;
    avatarVideo.src = idleClipUrl;
    avatarVideo.play().catch(() => {});
  } else {
    // No idle clip to return to: unload the element so it does not keep
    // pointing at the object URL that was just revoked.
    avatarVideo.removeAttribute("src");
    avatarVideo.load();
  }

  isPlaying = false;
}
|
||||
|
||||
/**
 * Upload the image chosen in the "avatar-file" input to /api/set-avatar and,
 * on success, switch the stage to the idle clip the server returns.
 *
 * Progress and failures are reported through the "avatar-status" element.
 * The upload button is disabled while the request is in flight so a slow
 * render cannot be started twice.
 *
 * @returns {Promise<void>}
 */
async function uploadAvatar() {
  const fileInput = document.getElementById("avatar-file");
  const status = document.getElementById("avatar-status");
  const uploadBtn = document.getElementById("upload-avatar-btn");

  if (!fileInput.files || !fileInput.files[0]) {
    status.textContent = "Pick an image first.";
    return;
  }

  status.textContent = "Uploading and rendering idle clip (this takes a while)...";
  const fd = new FormData();
  fd.append("image", fileInput.files[0]);

  if (uploadBtn) uploadBtn.disabled = true;
  try {
    const resp = await fetch("/api/set-avatar", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();

    // Without this check a missing URL would become the string "undefined?t=…".
    if (!data.idle_clip_url) {
      throw new Error("server response did not include idle_clip_url");
    }

    // Cache-bust, respecting a query string the server may already have added.
    const separator = data.idle_clip_url.includes("?") ? "&" : "?";
    idleClipUrl = data.idle_clip_url + separator + "t=" + Date.now();

    videoModeEnabled = true;
    videoModeName = data.mode || videoModeName;
    refreshStage();
    status.textContent = "Avatar ready (" + data.mode + ")";
  } catch (err) {
    console.error(err);
    status.textContent = "Failed: " + err.message;
  } finally {
    if (uploadBtn) uploadBtn.disabled = false;
  }
}
|
||||
|
||||
/**
 * Send the mode chosen in "video-mode-select" to /api/set-video-mode and
 * mirror the server's answer in the local video state.
 *
 * When the server reports "off", video mode is disabled locally and the
 * stage is hidden. The result (or failure) is shown in "avatar-status".
 *
 * @returns {Promise<void>}
 */
async function applyVideoMode() {
  const sel = document.getElementById("video-mode-select");
  const status = document.getElementById("avatar-status");

  const fd = new FormData();
  fd.append("mode", sel.value);

  try {
    const resp = await fetch("/api/set-video-mode", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();

    // Keep the previous name if the server omits the mode, as uploadAvatar does.
    videoModeName = data.mode || videoModeName;
    if (data.mode === "off") {
      videoModeEnabled = false;
      stageEl.classList.remove("active");
    }
    status.textContent = "Mode: " + data.mode + (data.note ? " — " + data.note : "");
  } catch (err) {
    // Log as well as display, matching uploadAvatar's error handling.
    console.error(err);
    status.textContent = "Failed: " + err.message;
  }
}
|
||||
|
||||
@@ -275,6 +431,7 @@ async function startMic() {
|
||||
if (bargeInCount >= BARGE_IN_FRAMES) {
|
||||
// User is speaking over the assistant - interrupt
|
||||
stopPlayback();
|
||||
stopSpeakingClip();
|
||||
const msg = { type: "interrupt" };
|
||||
if (lastDisplayedChunkId >= 0) {
|
||||
msg.last_chunk_id = lastDisplayedChunkId;
|
||||
@@ -353,3 +510,5 @@ async function applyVoice() {
|
||||
// Expose to HTML onclick
|
||||
window.toggleMic = toggleMic;
|
||||
window.applyVoice = applyVoice;
|
||||
window.uploadAvatar = uploadAvatar;
|
||||
window.applyVideoMode = applyVideoMode;
|
||||
|
||||
@@ -12,6 +12,17 @@
|
||||
<span id="status-badge">Disconnected</span>
|
||||
</header>
|
||||
|
||||
<div id="stage">
|
||||
<video
|
||||
id="avatar-video"
|
||||
autoplay
|
||||
muted
|
||||
loop
|
||||
playsinline
|
||||
preload="auto"
|
||||
></video>
|
||||
</div>
|
||||
|
||||
<div id="chat-area"></div>
|
||||
|
||||
<details id="voice-panel">
|
||||
@@ -40,6 +51,27 @@
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<details id="avatar-panel">
|
||||
<summary>Avatar / Video</summary>
|
||||
<div class="panel-content">
|
||||
<label>
|
||||
Avatar image
|
||||
<input type="file" id="avatar-file" accept="image/*" />
|
||||
</label>
|
||||
<button id="upload-avatar-btn" onclick="uploadAvatar()">Upload</button>
|
||||
<label>
|
||||
Mode
|
||||
<select id="video-mode-select">
|
||||
<option value="off">Off</option>
|
||||
<option value="library">Library (pre-baked)</option>
|
||||
<option value="reflective" selected>Reflective (per-turn)</option>
|
||||
</select>
|
||||
</label>
|
||||
<button id="apply-mode-btn" onclick="applyVideoMode()">Apply mode</button>
|
||||
<span id="avatar-status"></span>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<div id="controls">
|
||||
<button id="mic-btn" onclick="toggleMic()">🎤</button>
|
||||
</div>
|
||||
|
||||
+39
-4
@@ -52,6 +52,28 @@ header h1 {
|
||||
color: #a78bfa;
|
||||
}
|
||||
|
||||
/* Avatar stage: hidden by default, centred flex row once activated. */
#stage {
  display: none; /* becomes flex via .active when video mode is on */
  justify-content: center;
  align-items: center;
  background: #0a0a0a;
  padding: 16px 24px 0;
}

/* Added by the client script while video mode is enabled. */
#stage.active {
  display: flex;
}

/* The avatar player: a rounded 16:9 box, at most 480px wide. */
#avatar-video {
  max-width: 480px;
  width: 100%;
  aspect-ratio: 16 / 9;
  object-fit: cover;
  border-radius: 12px;
  background: #000;
  box-shadow: 0 8px 24px rgba(0, 0, 0, 0.4);
}
|
||||
|
||||
#chat-area {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
@@ -130,21 +152,34 @@ header h1 {
|
||||
50% { box-shadow: 0 0 0 12px rgba(239, 68, 68, 0); }
|
||||
}
|
||||
|
||||
/* Voice clone panel */
|
||||
#voice-panel {
|
||||
/* Voice + avatar panels */
|
||||
#voice-panel,
|
||||
#avatar-panel {
|
||||
padding: 12px 24px;
|
||||
border-top: 1px solid #222;
|
||||
background: #0a0a0a;
|
||||
}
|
||||
|
||||
#voice-panel summary {
|
||||
#voice-panel select,
|
||||
#avatar-panel select {
|
||||
background: #1a1a1a;
|
||||
border: 1px solid #333;
|
||||
border-radius: 6px;
|
||||
padding: 6px 10px;
|
||||
color: #e0e0e0;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
#voice-panel summary,
|
||||
#avatar-panel summary {
|
||||
cursor: pointer;
|
||||
font-size: 13px;
|
||||
color: #888;
|
||||
user-select: none;
|
||||
}
|
||||
|
||||
#voice-panel .panel-content {
|
||||
#voice-panel .panel-content,
|
||||
#avatar-panel .panel-content {
|
||||
margin-top: 12px;
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
|
||||
Reference in New Issue
Block a user