initial commit

This commit is contained in:
2026-04-07 03:58:35 -04:00
commit ce41bca422
17 changed files with 1184 additions and 0 deletions
+305
View File
@@ -0,0 +1,305 @@
// --- State ---
// Module-level state shared by the WebSocket, playback and microphone code below.
// Chat WebSocket; assigned by connectWS().
let ws = null;
// Playback AudioContext; created lazily by getPlaybackCtx().
let audioCtx = null;
// MediaStream obtained from getUserMedia while the microphone is on.
let micStream = null;
// AudioWorkletNode ("pcm-processor") whose port delivers microphone PCM chunks.
let workletNode = null;
// True after startMic() succeeds, false again after stopMic().
let micActive = false;
// AudioContext time at which the next queued playback chunk will start.
let nextPlayTime = 0;
// True while at least one scheduled playback source has not ended yet.
let isPlaying = false;
const PLAYBACK_SR = 24000; // TTS output sample rate
// Sample rate requested for microphone capture and its AudioContext.
const MIC_SR = 16000;
const BARGE_IN_THRESHOLD = 0.03; // RMS energy threshold for barge-in
const BARGE_IN_FRAMES = 2; // Consecutive frames above threshold to trigger
// Number of consecutive mic frames seen above BARGE_IN_THRESHOLD during playback.
let bargeInCount = 0;
// DOM elements looked up once when the script loads.
const chatArea = document.getElementById("chat-area");
const statusBadge = document.getElementById("status-badge");
const micBtn = document.getElementById("mic-btn");
// --- WebSocket ---
/**
 * Open the chat WebSocket (`/ws/chat` on the current host) and wire up its
 * event handlers.
 *
 * The call is a no-op while a socket is already connecting or open, so the
 * reconnect timer and startMic() can both call it without creating duplicate
 * connections. When the current socket closes, a reconnect is attempted after
 * two seconds.
 */
function connectWS() {
  if (
    ws &&
    (ws.readyState === WebSocket.CONNECTING || ws.readyState === WebSocket.OPEN)
  ) {
    return;
  }

  const proto = location.protocol === "https:" ? "wss:" : "ws:";
  const socket = new WebSocket(`${proto}//${location.host}/ws/chat`);
  socket.binaryType = "arraybuffer";
  ws = socket;

  socket.onopen = () => {
    setStatus("listening");
  };

  socket.onclose = () => {
    // A socket that has already been replaced must not touch the UI or
    // schedule another reconnect.
    if (ws !== socket) {
      return;
    }
    setStatus("disconnected");
    setTimeout(connectWS, 2000);
  };

  socket.onerror = () => {
    // Close this particular socket (the global may point elsewhere by now);
    // its onclose handler takes care of reconnecting.
    socket.close();
  };

  socket.onmessage = (event) => {
    // Binary frames carry audio, text frames carry JSON control messages.
    if (event.data instanceof ArrayBuffer) {
      playAudioChunk(event.data);
      return;
    }
    let msg;
    try {
      msg = JSON.parse(event.data);
    } catch (err) {
      console.error("Ignoring malformed server message:", err);
      return;
    }
    handleJSON(msg);
  };
}
/**
 * Dispatch one parsed JSON control message from the server.
 *
 * Recognized `msg.type` values: "status", "interrupt", "transcript" and
 * "response_text"; any other type is ignored.
 *
 * @param {object} msg - Parsed message; the fields read depend on `msg.type`.
 */
function handleJSON(msg) {
  const kind = msg.type;

  if (kind === "status") {
    setStatus(msg.state);
    return;
  }

  if (kind === "interrupt") {
    // Silence queued audio, then close the assistant bubble being streamed.
    stopPlayback();
    finalizeAssistantMessage();
    return;
  }

  if (kind === "transcript") {
    addMessage("user", msg.text);
    return;
  }

  if (kind === "response_text") {
    if (!msg.final) {
      appendAssistantText(msg.text);
      return;
    }
    finalizeAssistantMessage();
  }
}
// --- Status ---
/**
 * Show a connection/conversation state in the status badge.
 *
 * Known states get a human-readable label; any other value is displayed
 * verbatim. The raw state is also written to the badge's class name.
 *
 * @param {string} state - e.g. "listening", "thinking", "speaking", "disconnected".
 */
function setStatus(state) {
  let label;
  switch (state) {
    case "listening":
      label = "Listening";
      break;
    case "thinking":
      label = "Thinking...";
      break;
    case "speaking":
      label = "Speaking";
      break;
    case "disconnected":
      label = "Disconnected";
      break;
    default:
      label = state;
  }
  statusBadge.textContent = label;
  statusBadge.className = state;
}
// --- Chat Messages ---
// Element of the assistant message currently being streamed; null when none is open.
let currentAssistantEl = null;
// Text accumulated so far for the open assistant message.
let currentAssistantText = "";
/**
 * Append a complete chat bubble to the chat area and scroll it into view.
 *
 * @param {string} role - Used as a CSS class next to "message" (e.g. "user").
 * @param {string} text - Message text; inserted as plain text, not HTML.
 */
function addMessage(role, text) {
  const bubble = Object.assign(document.createElement("div"), {
    className: `message ${role}`,
    textContent: text,
  });
  chatArea.appendChild(bubble);
  // Keep the newest message visible.
  chatArea.scrollTop = chatArea.scrollHeight;
}
/**
 * Add a piece of streamed assistant text to the open assistant bubble,
 * creating the bubble first when none is open.
 *
 * Pieces are joined with a single space.
 *
 * @param {string} text - Next piece of the assistant's reply.
 */
function appendAssistantText(text) {
  if (!currentAssistantEl) {
    const bubble = document.createElement("div");
    bubble.className = "message assistant";
    chatArea.appendChild(bubble);
    currentAssistantEl = bubble;
    currentAssistantText = "";
  }

  // No separator before the very first piece.
  const separator = currentAssistantText ? " " : "";
  currentAssistantText = currentAssistantText + (separator + text);
  currentAssistantEl.textContent = currentAssistantText;
  chatArea.scrollTop = chatArea.scrollHeight;
}
/**
 * Close the assistant message being streamed so the next piece of assistant
 * text starts a new bubble. The bubble already in the chat area is left as is.
 */
function finalizeAssistantMessage() {
  currentAssistantText = "";
  currentAssistantEl = null;
}
// --- Audio Playback ---
// Buffer sources that have been scheduled for playback and have not ended yet.
let activeSources = [];
/**
 * Return the shared playback AudioContext, creating a new one at
 * PLAYBACK_SR when none exists yet or the previous one has been closed.
 *
 * @returns {AudioContext} The context stored in `audioCtx`.
 */
function getPlaybackCtx() {
  const reusable = audioCtx && audioCtx.state !== "closed";
  if (!reusable) {
    audioCtx = new AudioContext({ sampleRate: PLAYBACK_SR });
  }
  return audioCtx;
}
/**
 * Queue one chunk of TTS audio for gapless playback.
 *
 * The chunk is interpreted as mono signed 16-bit PCM at PLAYBACK_SR and is
 * scheduled right after the previously queued chunk. Chunks without a
 * complete sample are ignored, and a trailing odd byte is dropped, so a
 * malformed frame cannot throw inside the WebSocket message handler.
 *
 * @param {ArrayBuffer} arrayBuffer - Raw PCM bytes received from the server.
 */
function playAudioChunk(arrayBuffer) {
  // Int16Array requires a whole number of 2-byte samples, and createBuffer
  // rejects a zero-length buffer.
  const sampleCount = Math.floor(arrayBuffer.byteLength / 2);
  if (sampleCount === 0) {
    return;
  }
  const pcm = new Int16Array(arrayBuffer, 0, sampleCount);

  const ctx = getPlaybackCtx();
  const buffer = ctx.createBuffer(1, sampleCount, PLAYBACK_SR);
  // Convert int16 to float in [-1, 1) straight into the channel storage.
  const channel = buffer.getChannelData(0);
  for (let i = 0; i < sampleCount; i++) {
    channel[i] = pcm[i] / 32768;
  }

  const source = ctx.createBufferSource();
  source.buffer = buffer;
  source.connect(ctx.destination);
  activeSources.push(source);
  isPlaying = true;
  source.onended = () => {
    activeSources = activeSources.filter((s) => s !== source);
    if (activeSources.length === 0) {
      isPlaying = false;
      bargeInCount = 0;
    }
  };

  // If the queue ran dry (or was reset), restart slightly in the future;
  // otherwise start exactly where the previous chunk ends.
  const now = ctx.currentTime;
  if (nextPlayTime < now) {
    nextPlayTime = now + 0.01;
  }
  source.start(nextPlayTime);
  nextPlayTime += buffer.duration;
}
/**
 * Stop every queued or playing audio source at once and reset the playback
 * scheduling state (queue, next start time, playing flag, barge-in counter).
 */
function stopPlayback() {
  activeSources.forEach((node) => {
    try {
      node.stop();
    } catch (_err) {
      // Best effort: a source that cannot be stopped is simply dropped.
    }
  });
  activeSources = [];
  isPlaying = false;
  bargeInCount = 0;
  nextPlayTime = 0;
}
// --- Microphone ---
/**
 * Mic button handler: turn the microphone off when it is active, otherwise
 * start it and wait for the start-up to finish.
 *
 * @returns {Promise<void>}
 */
async function toggleMic() {
  if (!micActive) {
    await startMic();
    return;
  }
  stopMic();
}
/**
 * Root-mean-square level of a chunk of signed 16-bit PCM, with samples
 * normalized to [-1, 1).
 *
 * @param {ArrayBuffer} pcmBuffer - Raw int16 samples.
 * @returns {number} RMS level; 0 for an empty chunk.
 */
function pcm16Rms(pcmBuffer) {
  const samples = new Int16Array(pcmBuffer);
  if (samples.length === 0) {
    return 0;
  }
  let sumSquares = 0;
  for (let i = 0; i < samples.length; i++) {
    const s = samples[i] / 32768;
    sumSquares += s * s;
  }
  return Math.sqrt(sumSquares / samples.length);
}

/**
 * Start microphone capture and stream PCM chunks to the server.
 *
 * Resumes the playback context (this runs inside a user gesture), opens the
 * microphone, builds a capture graph on a separate AudioContext at MIC_SR
 * that feeds the "pcm-processor" worklet, and forwards every chunk the
 * worklet posts over the WebSocket. While assistant audio is playing, the
 * chunks are also checked for speech energy to trigger a client-side
 * barge-in.
 *
 * If any step fails, everything acquired so far (mic tracks, capture context)
 * is released before the user is notified, so a failed start leaves no live
 * microphone behind.
 *
 * @returns {Promise<void>} Resolves once capture runs or the failure was reported.
 */
async function startMic() {
  let stream = null;
  let micCtx = null;
  try {
    // Ensure playback context exists (needed for user gesture)
    const playbackCtx = getPlaybackCtx();
    if (playbackCtx.state === "suspended") {
      await playbackCtx.resume();
    }

    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: MIC_SR,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
      },
    });

    // Separate context at the capture rate; playback uses its own context.
    micCtx = new AudioContext({ sampleRate: MIC_SR });
    const source = micCtx.createMediaStreamSource(stream);
    await micCtx.audioWorklet.addModule("/static/processor.js");
    const node = new AudioWorkletNode(micCtx, "pcm-processor");
    source.connect(node);

    node.port.onmessage = (e) => {
      if (!ws || ws.readyState !== WebSocket.OPEN) {
        return;
      }
      ws.send(e.data);

      // Client-side barge-in: detect mic energy while the assistant is playing.
      if (!isPlaying) {
        return;
      }
      if (!(pcm16Rms(e.data) > BARGE_IN_THRESHOLD)) {
        bargeInCount = 0;
        return;
      }
      bargeInCount++;
      if (bargeInCount >= BARGE_IN_FRAMES) {
        // User is speaking over the assistant - interrupt.
        // stopPlayback() also resets isPlaying and bargeInCount.
        stopPlayback();
        finalizeAssistantMessage();
        ws.send(JSON.stringify({ type: "interrupt" }));
      }
    };

    // Connect the WebSocket unless one is already open or on its way.
    if (
      !ws ||
      (ws.readyState !== WebSocket.OPEN &&
        ws.readyState !== WebSocket.CONNECTING)
    ) {
      connectWS();
    }

    // Publish state only after every fallible step succeeded.
    node._micCtx = micCtx; // stopMic() closes the capture context through this
    micStream = stream;
    workletNode = node;
    micActive = true;
    micBtn.classList.add("active");
  } catch (err) {
    // Release whatever was acquired before the failure.
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
    }
    if (micCtx) {
      micCtx.close().catch(() => {
        // Best-effort cleanup; the original error is reported below.
      });
    }
    console.error("Mic access failed:", err);
    alert("Could not access microphone. Please allow mic permissions.");
  }
}
/**
 * Stop microphone capture: detach the worklet node, close the capture
 * context attached to it, stop all tracks of the mic stream, and clear the
 * active state of the mic button. Safe to call when nothing is running.
 */
function stopMic() {
  const node = workletNode;
  if (node) {
    node.disconnect();
    const captureCtx = node._micCtx;
    if (captureCtx) {
      captureCtx.close();
    }
    workletNode = null;
  }

  const stream = micStream;
  if (stream) {
    for (const track of stream.getTracks()) {
      track.stop();
    }
    micStream = null;
  }

  micActive = false;
  micBtn.classList.remove("active");
}
// --- Voice Selection ---
/**
 * Send the voice chosen in the "voice-select" dropdown to the server and
 * report the outcome in the "voice-status" element.
 *
 * Posts form fields `voice` and `lang` to /api/set-voice. A non-2xx
 * response, or a body whose `status` is not "ok", is shown as "Failed.";
 * network and parsing errors are shown as "Error: <message>".
 *
 * @returns {Promise<void>}
 */
async function applyVoice() {
  const voice = document.getElementById("voice-select").value;
  const statusEl = document.getElementById("voice-status");

  const formData = new FormData();
  formData.append("voice", voice);
  // NOTE(review): "a" is sent as a fixed language code — confirm its meaning with the server.
  formData.append("lang", "a");

  statusEl.textContent = "Applying...";
  try {
    const resp = await fetch("/api/set-voice", {
      method: "POST",
      body: formData,
    });
    // Error responses may not carry JSON; don't let a parse error hide the failure.
    if (!resp.ok) {
      statusEl.textContent = "Failed.";
      return;
    }
    const data = await resp.json();
    statusEl.textContent =
      data && data.status === "ok" ? `Voice: ${voice}` : "Failed.";
  } catch (err) {
    statusEl.textContent = `Error: ${err.message}`;
  }
}
// The page's inline onclick attributes call these handlers by name, so
// publish them on window.
Object.assign(window, { toggleMic, applyVoice });
+49
View File
@@ -0,0 +1,49 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Voice Chat</title>
    <link rel="stylesheet" href="/static/style.css" />
  </head>
  <body>
    <header>
      <h1>Voice Chat</h1>
      <!-- Text and class are updated by setStatus() in app.js; aria-live lets
           assistive technology announce the changes. -->
      <span id="status-badge" role="status" aria-live="polite">Disconnected</span>
    </header>

    <!-- Chat bubbles are appended here by app.js. -->
    <div id="chat-area"></div>

    <details id="voice-panel">
      <summary>Voice Settings</summary>
      <div class="panel-content">
        <label>
          Voice
          <select id="voice-select">
            <optgroup label="Female">
              <option value="af_heart" selected>Heart</option>
              <option value="af_nicole">Nicole</option>
              <option value="af_bella">Bella</option>
              <option value="af_sarah">Sarah</option>
              <option value="af_nova">Nova</option>
              <option value="af_jessica">Jessica</option>
              <option value="af_river">River</option>
            </optgroup>
            <optgroup label="Male">
              <option value="am_adam">Adam</option>
              <option value="am_michael">Michael</option>
            </optgroup>
          </select>
        </label>
        <button id="apply-voice-btn" type="button" onclick="applyVoice()">Apply</button>
        <span id="voice-status" aria-live="polite"></span>
      </div>
    </details>

    <div id="controls">
      <!-- Icon-only button: aria-label supplies the accessible name. -->
      <button
        id="mic-btn"
        type="button"
        aria-label="Toggle microphone"
        onclick="toggleMic()"
      >&#x1F3A4;</button>
    </div>

    <script src="/static/app.js"></script>
  </body>
</html>
+42
View File
@@ -0,0 +1,42 @@
/**
 * AudioWorkletProcessor that collects 512-sample chunks of PCM audio
 * and posts them to the main thread for WebSocket transmission.
 *
 * Incoming samples from the first channel of the first input are gathered in
 * a preallocated buffer; each time it fills up, the chunk is converted to
 * signed 16-bit PCM and its ArrayBuffer is transferred through the port.
 * Using a fixed buffer avoids reallocating and copying the accumulated audio
 * on every process() call on the audio rendering thread.
 */
class PCMProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.chunkSize = 512; // 512 samples = 32ms when the context runs at 16kHz
    // Fixed accumulation buffer and the number of samples currently in it.
    this.buffer = new Float32Array(this.chunkSize);
    this.filled = 0;
  }

  /**
   * Accumulate the incoming samples and post every chunk that completes.
   *
   * @param {Float32Array[][]} inputs - Per-input arrays of per-channel sample blocks.
   * @returns {boolean} Always true, to keep the processor alive.
   */
  process(inputs) {
    const input = inputs[0];
    if (!input || !input[0]) return true;
    const channelData = input[0]; // mono

    let offset = 0;
    while (offset < channelData.length) {
      // Copy as much as fits into the remainder of the current chunk.
      const count = Math.min(
        this.chunkSize - this.filled,
        channelData.length - offset
      );
      this.buffer.set(channelData.subarray(offset, offset + count), this.filled);
      this.filled += count;
      offset += count;

      if (this.filled === this.chunkSize) {
        this.postChunk();
        this.filled = 0;
      }
    }
    return true;
  }

  /**
   * Convert the full accumulation buffer to int16 and transfer it to the
   * main thread.
   */
  postChunk() {
    const int16 = new Int16Array(this.chunkSize);
    for (let i = 0; i < this.chunkSize; i++) {
      // Clamp to [-1, 1] before scaling to the int16 range.
      const s = Math.max(-1, Math.min(1, this.buffer[i]));
      int16[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
    }
    // Transfer (not copy) the buffer; a fresh Int16Array is made per chunk.
    this.port.postMessage(int16.buffer, [int16.buffer]);
  }
}

registerProcessor("pcm-processor", PCMProcessor);
+185
View File
@@ -0,0 +1,185 @@
/* Global reset: no default margins/padding, border-box sizing everywhere. */
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
/* Page shell: dark theme, full-height vertical flex column. */
body {
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0f0f0f;
color: #e0e0e0;
height: 100vh;
display: flex;
flex-direction: column;
}
/* Top bar: title on the left, status badge on the right. */
header {
padding: 16px 24px;
border-bottom: 1px solid #222;
display: flex;
align-items: center;
justify-content: space-between;
}
header h1 {
font-size: 18px;
font-weight: 600;
color: #fff;
}
/* Status badge: base look, used when no state-specific rule below matches. */
#status-badge {
padding: 4px 12px;
border-radius: 12px;
font-size: 13px;
font-weight: 500;
background: #1a1a2e;
color: #888;
transition: all 0.3s;
}
/* State colors; the class name is set from script to the current state. */
#status-badge.listening {
background: #0a2a1a;
color: #4ade80;
}
#status-badge.thinking {
background: #2a1a0a;
color: #fbbf24;
}
#status-badge.speaking {
background: #1a0a2a;
color: #a78bfa;
}
/* Scrollable message list that takes all remaining vertical space. */
#chat-area {
flex: 1;
overflow-y: auto;
padding: 24px;
display: flex;
flex-direction: column;
gap: 12px;
}
/* Chat bubble shared styles. */
.message {
max-width: 70%;
padding: 10px 16px;
border-radius: 16px;
font-size: 15px;
line-height: 1.5;
word-wrap: break-word;
}
/* User bubbles: right-aligned, blue. */
.message.user {
align-self: flex-end;
background: #1d4ed8;
color: #fff;
border-bottom-right-radius: 4px;
}
/* Assistant bubbles: left-aligned, dark grey. */
.message.assistant {
align-self: flex-start;
background: #1e1e1e;
color: #e0e0e0;
border-bottom-left-radius: 4px;
}
/* Bottom control bar holding the mic button. */
#controls {
padding: 16px 24px;
border-top: 1px solid #222;
display: flex;
align-items: center;
gap: 16px;
}
/* Round microphone toggle button. */
#mic-btn {
width: 56px;
height: 56px;
border-radius: 50%;
border: 2px solid #333;
background: #1a1a1a;
color: #e0e0e0;
font-size: 24px;
cursor: pointer;
transition: all 0.2s;
display: flex;
align-items: center;
justify-content: center;
}
#mic-btn:hover {
border-color: #555;
background: #222;
}
/* Active state (class "active" on the button): red with a pulsing ring. */
#mic-btn.active {
border-color: #ef4444;
background: #2a0a0a;
color: #ef4444;
animation: pulse 1.5s infinite;
}
/* Expanding, fading red ring used by the active mic button. */
@keyframes pulse {
0%, 100% { box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.3); }
50% { box-shadow: 0 0 0 12px rgba(239, 68, 68, 0); }
}
/* Voice settings panel (collapsible <details> above the controls).
   The #upload-* and input selectors are kept for backward compatibility;
   the controls present in the page are the <select>, #apply-voice-btn and
   #voice-status. */
#voice-panel {
  padding: 12px 24px;
  border-top: 1px solid #222;
  background: #0a0a0a;
}

#voice-panel summary {
  cursor: pointer;
  font-size: 13px;
  color: #888;
  user-select: none;
}

#voice-panel .panel-content {
  margin-top: 12px;
  display: flex;
  gap: 12px;
  align-items: flex-end;
  flex-wrap: wrap;
}

#voice-panel label {
  font-size: 13px;
  color: #aaa;
  display: flex;
  flex-direction: column;
  gap: 4px;
}

/* Form fields inside the panel, including the voice dropdown. */
#voice-panel select,
#voice-panel input[type="file"],
#voice-panel input[type="text"] {
  background: #1a1a1a;
  border: 1px solid #333;
  border-radius: 6px;
  padding: 6px 10px;
  color: #e0e0e0;
  font-size: 13px;
}

/* Panel action button ("Apply"). */
#apply-voice-btn,
#upload-btn {
  padding: 6px 16px;
  border-radius: 6px;
  border: 1px solid #333;
  background: #1a1a1a;
  color: #e0e0e0;
  font-size: 13px;
  cursor: pointer;
}

#apply-voice-btn:hover,
#upload-btn:hover {
  background: #222;
}

/* Inline result text next to the action button. */
#voice-status,
#upload-status {
  font-size: 12px;
  color: #888;
  margin-left: 8px;
}