// live-voice-chat/static/app.js

// --- State ---
// Module-level state shared by the WebSocket, playback, and microphone code below.
let ws = null; // WebSocket to /ws/chat; assigned by connectWS()
let audioCtx = null; // playback AudioContext; created lazily by getPlaybackCtx()
let micStream = null; // MediaStream returned by getUserMedia in startMic()
let workletNode = null; // AudioWorkletNode ("pcm-processor") created in startMic()
let micActive = false; // set true at the end of startMic(), false in stopMic()
let nextPlayTime = 0; // playback-context time at which the next audio chunk will be started
let isPlaying = false; // true while at least one scheduled audio source has not ended

const PLAYBACK_SR = 24000; // TTS output sample rate
const MIC_SR = 16000; // sample rate requested from getUserMedia and used for the mic AudioContext
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
let bargeInCount = 0; // consecutive mic frames seen above BARGE_IN_THRESHOLD during playback

// --- Text-audio sync state ---
// Each incoming audio chunk is paired with the oldest queued text chunk (see playAudioChunk).
let pendingTextChunks = []; // [{chunkId, text}] - text waiting for its audio to arrive
let scheduledTextTimers = []; // timer IDs for text display scheduled to match audio playback
let lastDisplayedChunkId = -1; // last chunk whose text was actually shown to the user; -1 = none

// DOM handles looked up once at load time; the elements must exist before this script runs.
const chatArea = document.getElementById("chat-area");
const statusBadge = document.getElementById("status-badge");
const micBtn = document.getElementById("mic-btn");

// --- WebSocket ---

/**
 * Open the chat WebSocket (ws:// or wss:// to match the page) and wire up its
 * handlers. The new socket is stored in the module-level `ws`.
 *
 * - Binary frames are handed to playAudioChunk().
 * - Text frames are parsed as JSON and handed to handleJSON(); frames that are
 *   not valid JSON are logged and ignored.
 * - When the socket closes, the status switches to "disconnected" and a
 *   reconnect is attempted after 2 seconds.
 *
 * Calling this while `ws` is still connecting or open is a no-op, so the
 * reconnect timer and startMic() cannot open a second, competing socket.
 */
function connectWS() {
  if (
    ws &&
    (ws.readyState === WebSocket.CONNECTING || ws.readyState === WebSocket.OPEN)
  ) {
    return;
  }

  const proto = location.protocol === "https:" ? "wss:" : "ws:";
  // Handlers close over `socket`, not the global `ws`, so a late event from an
  // old socket can never act on its replacement.
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  socket.onopen = () => {
    setStatus("listening");
  };

  socket.onclose = () => {
    setStatus("disconnected");
    setTimeout(connectWS, 2000);
  };

  socket.onerror = () => {
    // Closing triggers onclose, which schedules the reconnect.
    socket.close();
  };

  socket.onmessage = (event) => {
    if (event.data instanceof ArrayBuffer) {
      playAudioChunk(event.data);
      return;
    }

    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      console.error("Ignoring malformed message from server:", err);
      return;
    }
    handleJSON(msg);
  };
}

/**
 * Dispatch one decoded JSON message from the server by its `type`.
 *
 * - "status": forward `msg.state` to setStatus().
 * - "interrupt": stop playback and close the current assistant message with
 *   the interrupted marker.
 * - "transcript": add `msg.text` to the chat as a user message.
 * - "response_text": queue a text chunk, or handle the end-of-response signal
 *   when `msg.final` is set.
 *
 * Any other type is ignored.
 */
function handleJSON(msg) {
  const kind = msg.type;

  if (kind === "status") {
    setStatus(msg.state);
    return;
  }

  if (kind === "interrupt") {
    stopPlayback();
    // The bubble already shows only the text whose audio had started.
    finalizeAssistantMessage(true);
    return;
  }

  if (kind === "transcript") {
    addMessage("user", msg.text);
    return;
  }

  if (kind !== "response_text") {
    return;
  }

  if (!msg.final) {
    // Hold the text until its audio chunk arrives; playAudioChunk() displays it.
    pendingTextChunks.push({ chunkId: msg.chunk_id, text: msg.text });
    return;
  }

  // End of response: close the message now only if no text is still queued or
  // scheduled. Otherwise the playback path finalizes once it drains.
  const nothingOutstanding =
    pendingTextChunks.length === 0 && scheduledTextTimers.length === 0;
  if (nothingOutstanding) {
    finalizeAssistantMessage(false);
  }
}

// --- Status ---

/**
 * Show a connection/conversation state in the status badge.
 *
 * The four known states get a human-readable label; any other value is shown
 * verbatim. The raw state is also written to the badge's className.
 */
function setStatus(state) {
  let label;
  switch (state) {
    case "listening":
      label = "Listening";
      break;
    case "thinking":
      label = "Thinking...";
      break;
    case "speaking":
      label = "Speaking";
      break;
    case "disconnected":
      label = "Disconnected";
      break;
    default:
      label = state;
  }

  statusBadge.textContent = label;
  statusBadge.className = state;
}

// --- Chat Messages ---

// Chat bubble currently being filled by appendAssistantText(); null when no
// assistant message is in progress (reset by finalizeAssistantMessage()).
let currentAssistantEl = null;
// Text accumulated so far in that bubble.
let currentAssistantText = "";

/**
 * Append a complete chat bubble and scroll the chat area to the bottom.
 *
 * @param role  appended to the "message" CSS class (e.g. "user").
 * @param text  bubble content; set via textContent, so it is not parsed as HTML.
 */
function addMessage(role, text) {
  const bubble = Object.assign(document.createElement("div"), {
    className: `message ${role}`,
    textContent: text,
  });
  chatArea.appendChild(bubble);
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Add a piece of text to the in-progress assistant bubble, creating the
 * bubble first if none is open. Pieces are joined with a single space, and
 * the chat area is scrolled to the bottom afterwards.
 */
function appendAssistantText(text) {
  const needsNewBubble = !currentAssistantEl;
  if (needsNewBubble) {
    currentAssistantEl = document.createElement("div");
    currentAssistantEl.className = "message assistant";
    chatArea.appendChild(currentAssistantEl);
    currentAssistantText = "";
  }

  // No separator before the very first piece of a message.
  const separator = currentAssistantText ? " " : "";
  currentAssistantText = currentAssistantText + (separator + text);

  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}

/**
 * Close the in-progress assistant message and reset the text/audio sync state.
 *
 * @param interrupted  when true and the bubble already shows some text, an
 *                     " [interrupted]" marker span is appended to it.
 */
function finalizeAssistantMessage(interrupted = false) {
  const bubbleHasText = Boolean(currentAssistantEl && currentAssistantText);
  if (interrupted && bubbleHasText) {
    const marker = Object.assign(document.createElement("span"), {
      className: "interrupted-marker",
      textContent: " [interrupted]",
    });
    currentAssistantEl.appendChild(marker);
  }

  // The next appendAssistantText() call starts a fresh bubble.
  currentAssistantEl = null;
  currentAssistantText = "";

  // Drop queued text and cancel any text display that has not fired yet.
  pendingTextChunks = [];
  scheduledTextTimers.forEach((timerId) => clearTimeout(timerId));
  scheduledTextTimers = [];
  lastDisplayedChunkId = -1;
}

// --- Audio Playback ---

// Buffer sources queued by playAudioChunk() that have not ended yet;
// emptied by stopPlayback().
let activeSources = [];

/**
 * Return the shared playback AudioContext, creating a new one at PLAYBACK_SR
 * when none exists yet or the previous one has been closed.
 */
function getPlaybackCtx() {
  const reusable = audioCtx && audioCtx.state !== "closed";
  if (reusable) {
    return audioCtx;
  }

  audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  return audioCtx;
}

/**
 * Queue one chunk of assistant audio for gapless playback and schedule the
 * display of its paired text.
 *
 * The chunk is read as signed 16-bit PCM and played as a single channel at
 * PLAYBACK_SR. It is started at `nextPlayTime`, directly after the previously
 * queued chunk, or 10 ms from now if the queue has run dry. The oldest entry
 * in `pendingTextChunks` (if any) is paired with this chunk and shown by a
 * timer set to fire when the chunk starts playing.
 *
 * Edge cases:
 * - A trailing odd byte is ignored instead of throwing.
 * - A chunk with no samples plays nothing, but still consumes and displays
 *   its paired text so later chunks stay aligned with their text.
 *
 * The assistant message is finalized as soon as nothing is playing, queued, or
 * scheduled. Note that this is also true during a gap between two chunks of
 * the same response.
 *
 * @param arrayBuffer  raw PCM bytes received from the WebSocket.
 */
function playAudioChunk(arrayBuffer) {
  const ctx = getPlaybackCtx();

  // Int16Array needs whole 2-byte samples; an explicit length avoids the
  // RangeError an odd byteLength would otherwise raise.
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  const pcm = new Int16Array(arrayBuffer, 0, sampleCount);

  // Shared by the audio-ended and text-timer paths so whichever runs last
  // closes the message.
  const finalizeIfIdle = () => {
    if (
      activeSources.length === 0 &&
      pendingTextChunks.length === 0 &&
      scheduledTextTimers.length === 0
    ) {
      finalizeAssistantMessage(false);
    }
  };

  // Pair this audio chunk with the next queued text chunk.
  const textEntry = pendingTextChunks.shift();

  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    // Queue ran dry: restart slightly in the future rather than in the past.
    nextPlayTime = now + 0.01;
  }

  // Schedule text display to coincide with audio playback start.
  if (textEntry) {
    const delayMs = Math.max(0, (nextPlayTime - now) * 1000);
    const tid = setTimeout(() => {
      appendAssistantText(textEntry.text);
      lastDisplayedChunkId = textEntry.chunkId;
      scheduledTextTimers = scheduledTextTimers.filter((t) => t !== tid);
      // Only has an effect when the audio is already over (or there was none).
      finalizeIfIdle();
    }, delayMs);
    scheduledTextTimers.push(tid);
  }

  if (sampleCount === 0) {
    // createBuffer() rejects zero-length buffers; there is nothing to play.
    return;
  }

  const buffer = ctx.createBuffer(1, sampleCount, PLAYBACK_SR);
  const channel = buffer.getChannelData(0);
  for (let i = 0; i < sampleCount; i++) {
    channel[i] = pcm[i] / 32768; // int16 -> float in [-1, 1)
  }

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);

  activeSources.push(source);
  isPlaying = true;

  source.onended = () => {
    // stopPlayback() empties activeSources; a late "ended" event from a source
    // it already dropped must not touch state that may belong to a new response.
    if (!activeSources.includes(source)) {
      return;
    }
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length === 0) {
      isPlaying = false;
      bargeInCount = 0;
      finalizeIfIdle();
    }
  };

  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}

/**
 * Silence the assistant immediately: stop every queued audio source, reset
 * the playback schedule and barge-in counter, and discard text that has not
 * been displayed yet (both queued chunks and pending display timers).
 */
function stopPlayback() {
  activeSources.forEach((node) => {
    try {
      node.stop();
    } catch (_) {
      // Best effort: keep stopping the remaining sources if one refuses.
    }
  });
  activeSources = [];

  isPlaying = false;
  nextPlayTime = 0;
  bargeInCount = 0;

  // Cancel any pending text displays
  scheduledTextTimers.forEach((timerId) => clearTimeout(timerId));
  scheduledTextTimers = [];
  pendingTextChunks = [];
}

// --- Microphone ---

/**
 * Mic button handler: stop capture if it is running, otherwise start it and
 * wait for startMic() to finish.
 */
async function toggleMic() {
  if (!micActive) {
    await startMic();
    return;
  }
  stopMic();
}

/**
 * Start microphone capture and stream it to the server.
 *
 * Steps:
 *  1. Make sure the playback AudioContext exists and is resumed. This runs
 *     from a user gesture, which browsers require before audio can play.
 *  2. Request the microphone and feed it into a "pcm-processor" AudioWorklet
 *     running in a separate AudioContext at MIC_SR.
 *  3. Forward every frame the worklet posts to the WebSocket. While the
 *     assistant is playing, also measure the frame's RMS energy; after
 *     BARGE_IN_FRAMES consecutive frames above BARGE_IN_THRESHOLD, stop
 *     playback and send an "interrupt" message to the server.
 *  4. Mark the mic active and open the WebSocket unless one is already open
 *     or still connecting.
 *
 * On failure the error is logged, anything already acquired (mic stream, mic
 * AudioContext) is released, and the user is alerted. `micStream` and
 * `workletNode` are only assigned once setup has fully succeeded.
 */
async function startMic() {
  let stream = null;
  let micCtx = null;
  let published = false;

  try {
    // Ensure playback context exists (needed for user gesture)
    const playbackCtx = getPlaybackCtx();
    if (playbackCtx.state === "suspended") {
      await playbackCtx.resume();
    }

    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Create a separate context at 16kHz for mic capture
    micCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = micCtx.createMediaStreamSource(stream);

    await micCtx.audioWorklet.addModule("/static/processor.js");
    const node = new AudioWorkletNode(micCtx, "pcm-processor");
    source.connect(node);

    node.port.onmessage = (e) => {
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        return;
      }
      ws.send(e.data);

      // Client-side barge-in: detect mic energy while playing
      if (!isPlaying) {
        return;
      }

      const samples = new Int16Array(e.data);
      let sumSquares = 0;
      for (let i = 0; i < samples.length; i++) {
        const s = samples[i] / 32768;
        sumSquares += s * s;
      }
      const rms = Math.sqrt(sumSquares / samples.length);

      // Written as !(>) so an empty frame (NaN) also resets the streak.
      if (!(rms > BARGE_IN_THRESHOLD)) {
        bargeInCount = 0;
        return;
      }

      bargeInCount++;
      if (bargeInCount < BARGE_IN_FRAMES) {
        return;
      }

      // User is speaking over the assistant - interrupt
      stopPlayback();
      const msg = { type: "interrupt" };
      if (lastDisplayedChunkId >= 0) {
        // Report the last chunk whose text was shown; must be read before
        // finalizeAssistantMessage() resets it.
        msg.last_chunk_id = lastDisplayedChunkId;
      }
      ws.send(JSON.stringify(msg));
      finalizeAssistantMessage(true);
      isPlaying = false;
      bargeInCount = 0;
    };

    // Store for cleanup
    node._micCtx = micCtx;

    // Publish only after everything succeeded so a failed attempt never
    // leaves half-initialized state behind.
    micStream = stream;
    workletNode = node;
    published = true;

    micActive = true;
    micBtn.classList.add("active");

    // Connect WebSocket unless one is already open or on its way.
    const socketUsable =
      ws &&
      (ws.readyState === WebSocket.OPEN ||
        ws.readyState === WebSocket.CONNECTING);
    if (!socketUsable) {
      connectWS();
    }
  } catch (err) {
    console.error("Mic access failed:", err);

    // Release whatever was acquired before the failure; otherwise the browser
    // keeps the microphone open with no way for the user to stop it.
    if (!published) {
      if (stream) {
        stream.getTracks().forEach((t) => t.stop());
      }
      if (micCtx) {
        // Best-effort cleanup: a rejected close() leaves nothing more to do.
        micCtx.close().catch(() => {});
      }
    }

    alert("Could not access microphone. Please allow mic permissions.");
  }
}

/**
 * Stop microphone capture: disconnect the worklet node, close the mic
 * AudioContext stored on it, stop every track of the mic stream, and clear
 * the active state on the mic button. Safe to call when nothing is running.
 */
function stopMic() {
  const node = workletNode;
  if (node) {
    node.disconnect();
    const micCtx = node._micCtx; // attached by startMic() for this cleanup
    if (micCtx) {
      micCtx.close();
    }
    workletNode = null;
  }

  const stream = micStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }

  micActive = false;
  micBtn.classList.remove("active");
}

// --- Voice Selection ---

/**
 * Send the voice chosen in #voice-select to the server and report the outcome
 * in #voice-status.
 *
 * The request is a multipart POST to /api/set-voice with fields `voice` and
 * `lang` (always "a"). The status element shows "Applying..." while the
 * request is in flight, then one of:
 *   - "Voice: <name>" when the JSON reply has status "ok"
 *   - "Failed." for any other JSON reply
 *   - "Error: <message>" if the request or JSON decoding throws
 */
async function applyVoice() {
  const selectedVoice = document.getElementById("voice-select").value;
  const statusLabel = document.getElementById("voice-status");

  const payload = new FormData();
  payload.append("voice", selectedVoice);
  payload.append("lang", "a");

  statusLabel.textContent = "Applying...";

  let outcome;
  try {
    const response = await fetch("/api/set-voice", {
      method: "POST",
      body: payload,
    });
    const reply = await response.json();
    outcome = reply.status === "ok" ? `Voice: ${selectedVoice}` : "Failed.";
  } catch (err) {
    outcome = `Error: ${err.message}`;
  }
  statusLabel.textContent = outcome;
}

// Make the two UI handlers reachable as globals for use from HTML onclick.
Object.assign(window, { toggleMic, applyVoice });