// live-voice-chat/static/app.js

// --- State ---
// Module-level mutable state shared by the WebSocket, playback and microphone
// code in this file.
let ws = null; // chat WebSocket; assigned by connectWS()
let audioCtx = null; // playback AudioContext; created lazily by getPlaybackCtx()
let micStream = null; // MediaStream from getUserMedia while the mic is on
let workletNode = null; // AudioWorkletNode ("pcm-processor") delivering mic frames
let micActive = false; // true between a successful startMic() and stopMic()
let nextPlayTime = 0; // AudioContext time at which the next PCM chunk will be started
let isPlaying = false; // true while PCM audio or a speaking clip plays; gates barge-in detection

const PLAYBACK_SR = 24000; // TTS output sample rate
const MIC_SR = 16000; // sample rate requested for mic capture and its AudioContext
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
let bargeInCount = 0; // consecutive mic frames seen above BARGE_IN_THRESHOLD

// --- Text-audio sync state ---
// Text chunks arrive as JSON ahead of their audio; each PCM chunk consumes the
// oldest pending text entry and schedules it for display at audio start.
let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to arrive
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user
                               // (-1 = none); sent as last_chunk_id on a client interrupt

// --- Video mode state ---
let videoModeEnabled = false;   // true when server has video engine active AND ready
let videoModeName = "off";      // "off" | "library" | "reflective"
                                // (recorded only; no code in this file branches on it)
let idleClipUrl = null;         // URL string (server-served) or null
let pendingSpeakingClipMeta = null; // {chunk_id, duration_ms, text} waiting for MP4 binary
let currentSpeakingClipBlobUrl = null; // blob: URL of the speaking clip currently loaded, else null
let speakingClipQueue = [];     // [{blobUrl, meta}] clips waiting to play
let currentClipGeneration = 0; // incremented each clip start; guards stale onended handlers

// DOM handles, looked up once at script load. The script therefore has to run
// after these elements exist in the document.
const chatArea = document.getElementById("chat-area"); // message bubbles are appended here
const statusBadge = document.getElementById("status-badge"); // connection/turn status label
const micBtn = document.getElementById("mic-btn"); // gets the "active" class while the mic is on
const avatarVideo = document.getElementById("avatar-video"); // plays idle and speaking clips
const stageEl = document.getElementById("stage"); // gets the "active" class in video mode

// --- WebSocket ---

/**
 * Open the chat WebSocket (`/ws/chat` on the current host) and install its
 * handlers. Does nothing when a socket is already connecting or open, so
 * callers may invoke it freely.
 *
 * Binary frames are routed to the speaking-clip player when a "speaking_clip"
 * envelope is pending, dropped when video mode is on without an envelope, and
 * otherwise played as PCM audio. Text frames are parsed as JSON and passed to
 * handleJSON(). A reconnect is scheduled after the current socket closes.
 */
function connectWS() {
  // A second socket here would leave two connections that each reconnect on
  // their own, doubling every incoming frame.
  if (
    ws &&
    (ws.readyState === WebSocket.CONNECTING || ws.readyState === WebSocket.OPEN)
  ) {
    return;
  }

  const RECONNECT_DELAY_MS = 2000;
  const proto = location.protocol === "https:" ? "wss:" : "ws:";

  // Handlers close over `socket`, not the module-level `ws`: by the time an
  // old socket's events fire, `ws` may already point at its replacement.
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  socket.onopen = () => {
    setStatus("listening");
  };

  socket.onclose = () => {
    // A superseded socket must not flip the status or start another reconnect.
    if (ws !== socket) return;
    // An envelope whose binary frame never arrived must not capture the first
    // binary frame of the next connection.
    pendingSpeakingClipMeta = null;
    setStatus("disconnected");
    setTimeout(connectWS, RECONNECT_DELAY_MS);
  };

  socket.onerror = () => {
    socket.close();
  };

  socket.onmessage = (event) => {
    if (event.data instanceof ArrayBuffer) {
      // In video mode, the next binary frame after a "speaking_clip"
      // envelope is an MP4 blob; otherwise it's a PCM audio chunk.
      if (pendingSpeakingClipMeta) {
        const meta = pendingSpeakingClipMeta;
        pendingSpeakingClipMeta = null;
        playSpeakingClip(event.data, meta);
      } else if (videoModeEnabled) {
        // Video mode is active but we didn't get a speaking_clip envelope
        // first — ignore raw PCM so we don't double-play audio.
      } else {
        playAudioChunk(event.data);
      }
      return;
    }

    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      // One malformed frame should not surface as an uncaught handler error.
      console.error("Ignoring malformed WebSocket message:", err);
      return;
    }
    handleJSON(msg);
  };
}

/**
 * Route one decoded JSON control message from the server by its `type`.
 * Messages with an unrecognised type are ignored.
 *
 * @param {object} msg - Parsed message; the fields read depend on `msg.type`.
 */
function handleJSON(msg) {
  const kind = msg.type;

  if (kind === "status") {
    setStatus(msg.state);
    return;
  }

  if (kind === "interrupt") {
    // Tear down both output paths, then close the bubble with the
    // interrupted marker; what is on screen is what was displayed so far.
    stopPlayback();
    stopSpeakingClip();
    finalizeAssistantMessage(true);
    return;
  }

  if (kind === "transcript") {
    addMessage("user", msg.text);
    return;
  }

  if (kind === "response_text") {
    if (!msg.final) {
      // Hold the text until the matching audio chunk is scheduled.
      pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
      return;
    }
    // End-of-response marker: close the bubble now only if no text is still
    // queued or scheduled; otherwise the audio path finalizes later.
    const nothingOutstanding =
      pendingTextChunks.length === 0 && scheduledTextTimers.length === 0;
    if (nothingOutstanding) {
      finalizeAssistantMessage(false);
    }
    return;
  }

  if (kind === "video_mode") {
    applyVideoModeState(msg);
    return;
  }

  if (kind === "speaking_clip") {
    // Envelope only; the MP4 bytes arrive in the next binary frame.
    const { chunk_id, duration_ms, text } = msg;
    pendingSpeakingClipMeta = { chunk_id, duration_ms, text };
  }
}

// --- Video mode ------------------------------------------------------------

/**
 * Adopt the video-mode state announced by the server and refresh the stage.
 *
 * @param {object} msg - Message carrying `enabled`, `ready`, `mode` and
 *   `idle_clip_url`; missing or falsy fields fall back to off / null.
 */
function applyVideoModeState(msg) {
  const { enabled, ready, mode, idle_clip_url: clipUrl } = msg;

  // Video mode counts as on only when the server says it is enabled AND ready.
  videoModeEnabled = Boolean(enabled) && Boolean(ready);
  videoModeName = mode ? mode : "off";
  idleClipUrl = clipUrl ? clipUrl : null;

  refreshStage();
}

/**
 * Show or hide the avatar stage to match the current video-mode state and,
 * when the stage is shown, make sure the idle clip is what the video element
 * is playing.
 *
 * The idle clip is not loaded while a speaking clip is active: replacing the
 * source mid-clip would drop that clip's `onended` handler and leave the
 * speaking-clip bookkeeping (current blob URL, queue, `isPlaying`) stuck. The
 * clip's own end/stop path returns to idle using the latest `idleClipUrl`.
 */
function refreshStage() {
  if (!(videoModeEnabled && idleClipUrl)) {
    stageEl.classList.remove("active");
    return;
  }

  stageEl.classList.add("active");

  if (currentSpeakingClipBlobUrl) return;

  // `avatarVideo.src` reads back as an absolute URL, so resolve the idle URL
  // the same way before comparing. This also covers idle URLs that are
  // already absolute or are relative to the page rather than the origin.
  const idleHref = new URL(idleClipUrl, document.baseURI).href;
  if (avatarVideo.src !== idleHref) {
    _returnToIdle();
  }
}

/**
 * Switch the avatar video back to the muted idle clip. No-op when no idle
 * clip URL is known.
 *
 * The idle clip loops. This function clears `onended`, so nothing would
 * restart a non-looping clip and the avatar would freeze on its last frame.
 * _startSpeakingClip() turns looping off again for speaking clips.
 */
function _returnToIdle() {
  if (!idleClipUrl) return;
  avatarVideo.onended = null;
  avatarVideo.loop = true;
  avatarVideo.muted = true;
  avatarVideo.src = idleClipUrl;
  avatarVideo.play().catch((err) => {
    // play() is rejected with AbortError whenever another clip replaces this
    // one before playback starts; that is routine and not worth reporting.
    if (err && err.name === "AbortError") return;
    console.warn("idle clip play failed:", err);
  });
}

/**
 * Wrap received MP4 bytes in a blob URL and either start the clip now or
 * queue it behind the clip that is currently playing.
 *
 * @param {ArrayBuffer} arrayBuffer - MP4 data from a binary WebSocket frame.
 * @param {object} meta - Envelope that preceded the frame (chunk_id, duration_ms, text).
 */
function playSpeakingClip(arrayBuffer, meta) {
  const blobUrl = URL.createObjectURL(
    new Blob([arrayBuffer], { type: "video/mp4" })
  );

  const videoIsFree = currentSpeakingClipBlobUrl === null;
  if (videoIsFree) {
    _startSpeakingClip(blobUrl, meta);
    return;
  }

  // Another clip holds the video element; this one plays when it ends.
  speakingClipQueue.push({ blobUrl, meta });
}

/**
 * Load a speaking clip into the avatar video, show its text, and start it.
 * When the clip finishes, the next queued clip starts; if none is queued the
 * assistant message is finalized and the video returns to the idle clip.
 *
 * A clip whose playback cannot start (play() rejected, e.g. by the browser's
 * autoplay policy) is treated as finished. Otherwise `onended` would never
 * fire and the current blob URL, `isPlaying` and the queue would stay stuck.
 *
 * @param {string} blobUrl - blob: URL of the MP4 to play; revoked when done.
 * @param {object|null} meta - Clip envelope; `meta.text` (if any) is appended
 *   to the assistant bubble.
 */
function _startSpeakingClip(blobUrl, meta) {
  const gen = ++currentClipGeneration;

  if (currentSpeakingClipBlobUrl) {
    URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  }
  currentSpeakingClipBlobUrl = blobUrl;

  avatarVideo.loop = false;
  avatarVideo.muted = false;
  avatarVideo.src = blobUrl;

  if (meta && meta.text) {
    appendAssistantText(meta.text);
  }
  isPlaying = true;

  const finishClip = () => {
    // Stale: a newer clip started or stopSpeakingClip() ran since this clip began.
    if (currentClipGeneration !== gen) return;
    // Already finished once (the ended and failed-play paths share this function).
    if (currentSpeakingClipBlobUrl !== blobUrl) return;

    URL.revokeObjectURL(blobUrl);
    currentSpeakingClipBlobUrl = null;

    const next = speakingClipQueue.shift();
    if (next) {
      _startSpeakingClip(next.blobUrl, next.meta);
    } else {
      isPlaying = false;
      finalizeAssistantMessage(false);
      _returnToIdle();
    }
  };

  avatarVideo.onended = finishClip;

  avatarVideo.play().catch((e) => {
    console.error("speaking clip play failed:", e);
    // The generation check inside finishClip makes this a no-op when the
    // rejection is just the abort caused by stopSpeakingClip() or a newer clip.
    finishClip();
  });
}

/**
 * Abort speaking-clip playback: drop every queued clip, invalidate handlers
 * of the clip in flight, and, if a clip is loaded, pause it, release its blob
 * URL and return the video to the idle clip.
 */
function stopSpeakingClip() {
  // Queued clips will never play; release their blob URLs.
  speakingClipQueue.forEach((queued) => {
    URL.revokeObjectURL(queued.blobUrl);
  });
  speakingClipQueue = [];

  // Bumping the generation turns any onended handler already in flight into a no-op.
  currentClipGeneration += 1;

  if (!currentSpeakingClipBlobUrl) {
    return;
  }

  try {
    avatarVideo.pause();
  } catch (_) {
    // Teardown is best-effort; a failing pause() must not stop the cleanup below.
  }
  avatarVideo.onended = null;

  URL.revokeObjectURL(currentSpeakingClipBlobUrl);
  currentSpeakingClipBlobUrl = null;
  isPlaying = false;

  _returnToIdle();
}

/**
 * Upload the image chosen in #avatar-file to /api/set-avatar and, on success,
 * switch the stage to the idle clip the server rendered for it. Progress and
 * errors are reported in #avatar-status.
 *
 * The response is expected to carry `idle_clip_url` and optionally `mode`. A
 * response without `idle_clip_url` is reported as a failure instead of
 * enabling video mode with an unusable clip URL.
 */
async function uploadAvatar() {
  const fileInput = document.getElementById("avatar-file");
  const status = document.getElementById("avatar-status");
  const file = fileInput.files && fileInput.files[0];
  if (!file) {
    status.textContent = "Pick an image first.";
    return;
  }
  status.textContent = "Uploading and rendering idle clip (this takes a while)...";
  const fd = new FormData();
  fd.append("image", file);
  try {
    const resp = await fetch("/api/set-avatar", { method: "POST", body: fd });
    if (!resp.ok) throw new Error(await resp.text());
    const data = await resp.json();
    if (!data.idle_clip_url) {
      throw new Error("server response did not include an idle clip URL");
    }

    // Cache-bust so the freshly rendered clip replaces any cached one. Use "&"
    // when the URL already has a query string so the URL stays well-formed.
    const baseUrl = String(data.idle_clip_url);
    const separator = baseUrl.includes("?") ? "&" : "?";
    idleClipUrl = `${baseUrl}${separator}t=${Date.now()}`;

    videoModeEnabled = true;
    videoModeName = data.mode || videoModeName;
    refreshStage();
    // Report the mode actually in effect (falls back to the previous mode
    // when the response omits it).
    status.textContent = "Avatar ready (" + videoModeName + ")";
  } catch (err) {
    console.error(err);
    status.textContent = "Failed: " + err.message;
  }
}

/**
 * Send the mode chosen in #video-mode-select to /api/set-video-mode and show
 * the outcome in #avatar-status. Choosing "off" also disables video mode
 * locally and hides the stage.
 *
 * NOTE(review): switching to a mode other than "off" records only the mode
 * name here; it does not set `videoModeEnabled` or refresh the stage. Confirm
 * whether the server re-announces video_mode after this call.
 */
async function applyVideoMode() {
  const modeSelect = document.getElementById("video-mode-select");
  const statusEl = document.getElementById("avatar-status");

  const body = new FormData();
  body.append("mode", modeSelect.value);

  try {
    const response = await fetch("/api/set-video-mode", { method: "POST", body });
    if (!response.ok) {
      throw new Error(await response.text());
    }

    const result = await response.json();
    videoModeName = result.mode;

    if (result.mode === "off") {
      videoModeEnabled = false;
      stageEl.classList.remove("active");
    }

    const noteSuffix = result.note ? " — " + result.note : "";
    statusEl.textContent = "Mode: " + result.mode + noteSuffix;
  } catch (err) {
    statusEl.textContent = "Failed: " + err.message;
  }
}

// --- Status ---

/**
 * Update the status badge: a readable label as its text and the raw state as
 * its class name. States without a known label are shown verbatim.
 *
 * @param {string} state - e.g. "listening", "thinking", "speaking", "disconnected".
 */
function setStatus(state) {
  let label;
  switch (state) {
    case "listening":
      label = "Listening";
      break;
    case "thinking":
      label = "Thinking...";
      break;
    case "speaking":
      label = "Speaking";
      break;
    case "disconnected":
      label = "Disconnected";
      break;
    default:
      label = state;
  }

  statusBadge.textContent = label;
  statusBadge.className = state;
}

// --- Chat Messages ---

// Bubble element of the assistant reply currently being assembled, or null
// when no reply is open. Created by appendAssistantText(), cleared by
// finalizeAssistantMessage().
let currentAssistantEl = null;
// Text shown so far in that bubble (chunks joined with single spaces).
let currentAssistantText = "";

/**
 * Append a finished chat bubble to the chat area and scroll it into view.
 *
 * @param {string} role - Becomes the second CSS class ("message <role>").
 * @param {string} text - Bubble content; set as plain text, never parsed as HTML.
 */
function addMessage(role, text) {
  const bubble = Object.assign(document.createElement("div"), {
    className: `message ${role}`,
    textContent: text,
  });
  chatArea.appendChild(bubble);
  // Keep the newest message visible.
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Add a piece of text to the assistant bubble that is currently open,
 * opening a new bubble first if there is none. Pieces are joined with a
 * single space.
 *
 * @param {string} text - Text chunk to append.
 */
function appendAssistantText(text) {
  const needsNewBubble = !currentAssistantEl;
  if (needsNewBubble) {
    const bubble = document.createElement("div");
    bubble.className = "message assistant";
    chatArea.appendChild(bubble);
    currentAssistantEl = bubble;
    currentAssistantText = "";
  }

  const separator = currentAssistantText ? " " : "";
  currentAssistantText += separator + text;

  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Close the assistant bubble that is currently open and reset the text/audio
 * sync state so the next reply starts clean.
 *
 * @param {boolean} [interrupted=false] - When true and the bubble has text,
 *   an " [interrupted]" marker is appended to it.
 */
function finalizeAssistantMessage(interrupted = false) {
  const shouldMark = interrupted && currentAssistantEl && currentAssistantText;
  if (shouldMark) {
    const marker = Object.assign(document.createElement("span"), {
      className: "interrupted-marker",
      textContent: " [interrupted]",
    });
    currentAssistantEl.appendChild(marker);
  }

  // The bubble stays in the DOM; only the "currently open" handle is dropped.
  currentAssistantEl = null;
  currentAssistantText = "";

  // Discard queued text and cancel text that was scheduled but not yet shown.
  pendingTextChunks = [];
  scheduledTextTimers.forEach((timerId) => clearTimeout(timerId));
  scheduledTextTimers = [];
  lastDisplayedChunkId = -1;
}

// --- Audio Playback ---

// Buffer source nodes that playAudioChunk() has scheduled and that have not
// ended yet. stopPlayback() stops and forgets them all; when the list drains
// naturally, playback is considered finished.
let activeSources = [];

/**
 * Return the shared playback AudioContext, creating it at the TTS sample
 * rate on first use or after the previous one was closed.
 *
 * @returns {AudioContext} The context stored in `audioCtx`.
 */
function getPlaybackCtx() {
  const reusable = audioCtx && audioCtx.state !== "closed";
  if (reusable) {
    return audioCtx;
  }
  audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  return audioCtx;
}

/**
 * Schedule one chunk of 16-bit mono PCM (at PLAYBACK_SR) for gapless playback
 * right after the previously scheduled chunk, and time the matching text
 * chunk (the oldest entry in `pendingTextChunks`, if any) to appear when this
 * audio starts.
 *
 * Frames that hold no complete sample are ignored, and a dangling odd byte
 * is dropped. Both would otherwise throw (in the Int16Array constructor or in
 * createBuffer, which rejects zero-length buffers) inside the WebSocket
 * message handler. An ignored frame does not consume a text entry.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw PCM16 samples in platform byte order.
 */
function playAudioChunk(arrayBuffer) {
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) return;

  const ctx = getPlaybackCtx();
  const int16 = new Int16Array(arrayBuffer, 0, sampleCount);
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    // Scale [-32768, 32767] to [-1, 1).
    float32[i] = int16[i] / 32768;
  }

  const buffer = ctx.createBuffer(1, float32.length, PLAYBACK_SR);
  buffer.getChannelData(0).set(float32);

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);

  activeSources.push(source);
  isPlaying = true;

  // Pair this audio chunk with the next queued text chunk
  const textEntry = pendingTextChunks.shift();

  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    // The schedule fell behind (first chunk or an underrun): restart it just
    // ahead of the current time.
    nextPlayTime = now + 0.01;
  }

  // Schedule text display to coincide with audio playback start
  if (textEntry) {
    const delayMs = Math.max(0, (nextPlayTime - now) * 1000);
    const tid = setTimeout(() => {
      appendAssistantText(textEntry.text);
      lastDisplayedChunkId = textEntry.chunkId;
      scheduledTextTimers = scheduledTextTimers.filter((t) => t !== tid);
    }, delayMs);
    scheduledTextTimers.push(tid);
  }

  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length === 0) {
      isPlaying = false;
      bargeInCount = 0;
      // If all audio has finished and no more text pending, finalize
      if (pendingTextChunks.length === 0 && scheduledTextTimers.length === 0) {
        finalizeAssistantMessage(false);
      }
    }
  };

  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}

/**
 * Stop all scheduled PCM playback immediately and reset playback state,
 * including text that was queued or scheduled but not yet shown.
 */
function stopPlayback() {
  activeSources.forEach((node) => {
    try {
      node.stop();
    } catch (_) {
      // Best-effort: one source failing to stop must not keep the rest playing.
    }
  });
  activeSources = [];

  nextPlayTime = 0;
  isPlaying = false;
  bargeInCount = 0;

  // Text tied to the cancelled audio must never appear.
  scheduledTextTimers.forEach((timerId) => clearTimeout(timerId));
  scheduledTextTimers = [];
  pendingTextChunks = [];
}

// --- Microphone ---

/**
 * Mic button handler: turn capture off when it is on, otherwise start it and
 * wait until startup has finished (or failed).
 */
async function toggleMic() {
  if (!micActive) {
    await startMic();
    return;
  }
  stopMic();
}

/**
 * Root-mean-square level of a PCM16 frame, with samples scaled to [-1, 1).
 * An empty frame has level 0.
 */
function _pcm16Rms(samples) {
  if (samples.length === 0) return 0;
  let sumSquares = 0;
  for (let i = 0; i < samples.length; i++) {
    const s = samples[i] / 32768;
    sumSquares += s * s;
  }
  return Math.sqrt(sumSquares / samples.length);
}

/**
 * Client-side barge-in: after BARGE_IN_FRAMES consecutive mic frames louder
 * than BARGE_IN_THRESHOLD, stop assistant output and tell the server.
 * Called only while the assistant is playing and the socket is open.
 */
function _detectBargeIn(frame) {
  const level = _pcm16Rms(new Int16Array(frame));
  if (level <= BARGE_IN_THRESHOLD) {
    bargeInCount = 0;
    return;
  }

  bargeInCount++;
  if (bargeInCount < BARGE_IN_FRAMES) return;

  // User is speaking over the assistant - interrupt
  stopPlayback();
  stopSpeakingClip();
  const msg = { type: "interrupt" };
  if (lastDisplayedChunkId >= 0) {
    // Tell the server how far the user actually got.
    msg.last_chunk_id = lastDisplayedChunkId;
  }
  ws.send(JSON.stringify(msg));
  finalizeAssistantMessage(true);
  isPlaying = false;
  bargeInCount = 0;
}

/**
 * Start microphone capture: obtain the mic stream, run it through the
 * "pcm-processor" audio worklet at MIC_SR, stream every frame the worklet
 * posts to the server, and watch for barge-in while the assistant is playing.
 * Opens the WebSocket if none is connected or connecting.
 *
 * On failure the user is alerted and anything acquired before the failure
 * (mic stream, capture AudioContext, worklet node) is released, so the
 * browser does not keep the microphone open.
 */
async function startMic() {
  let micCtx = null;
  try {
    // Ensure playback context exists (needed for user gesture)
    getPlaybackCtx();
    if (audioCtx.state === "suspended") {
      await audioCtx.resume();
    }

    micStream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Create a separate context at 16kHz for mic capture
    micCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = micCtx.createMediaStreamSource(micStream);

    await micCtx.audioWorklet.addModule("/static/processor.js");
    workletNode = new AudioWorkletNode(micCtx, "pcm-processor");
    source.connect(workletNode);

    workletNode.port.onmessage = (e) => {
      if (!ws || ws.readyState !== WebSocket.OPEN) return;
      ws.send(e.data);
      if (isPlaying) {
        _detectBargeIn(e.data);
      }
    };

    // Store for cleanup
    workletNode._micCtx = micCtx;

    micActive = true;
    micBtn.classList.add("active");

    // Connect only when no socket is open or already on its way; connecting
    // again while one is still CONNECTING would create a duplicate.
    if (
      !ws ||
      ws.readyState === WebSocket.CLOSING ||
      ws.readyState === WebSocket.CLOSED
    ) {
      connectWS();
    }
  } catch (err) {
    console.error("Mic access failed:", err);
    if (!micActive) {
      // Setup did not complete: release whatever was acquired along the way.
      if (workletNode) {
        workletNode.disconnect();
        workletNode = null;
      }
      if (micCtx) {
        micCtx.close().catch((closeErr) => {
          console.warn("Closing mic AudioContext failed:", closeErr);
        });
      }
      if (micStream) {
        micStream.getTracks().forEach((t) => t.stop());
        micStream = null;
      }
    }
    alert("Could not access microphone. Please allow mic permissions.");
  }
}

/**
 * Stop microphone capture: detach the worklet node, close the capture
 * AudioContext stored on it, stop every track of the mic stream, and clear
 * the button's active state. Safe to call when capture is not running.
 */
function stopMic() {
  const node = workletNode;
  if (node) {
    node.disconnect();
    const captureCtx = node._micCtx;
    if (captureCtx) {
      captureCtx.close();
    }
    workletNode = null;
  }

  const stream = micStream;
  if (stream) {
    // Stopping the tracks is what releases the device.
    for (const track of stream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }

  micActive = false;
  micBtn.classList.remove("active");
}

// --- Voice Selection ---

/**
 * Send the voice chosen in #voice-select to /api/set-voice and report the
 * outcome in #voice-status.
 *
 * HTTP error responses are reported with the server's own message (as the
 * other API helpers in this file do), rather than being parsed as JSON, which
 * surfaces as an unrelated parse error when the error body is not JSON.
 */
async function applyVoice() {
  const voice = document.getElementById("voice-select").value;
  const statusEl = document.getElementById("voice-status");

  const formData = new FormData();
  formData.append("voice", voice);
  formData.append("lang", "a");

  statusEl.textContent = "Applying...";
  try {
    const resp = await fetch("/api/set-voice", {
      method: "POST",
      body: formData,
    });
    if (!resp.ok) {
      const detail = await resp.text();
      // An empty error body still needs something to show.
      throw new Error(detail || `HTTP ${resp.status}`);
    }
    const data = await resp.json();
    if (data.status === "ok") {
      statusEl.textContent = "Voice: " + voice;
    } else {
      statusEl.textContent = "Failed.";
    }
  } catch (err) {
    statusEl.textContent = "Error: " + err.message;
  }
}

// Inline onclick attributes in the HTML look these handlers up on the global
// object, so publish them there.
Object.assign(window, { toggleMic, applyVoice, uploadAvatar, applyVideoMode });