initial commit

2026-04-07 03:58:35 -04:00
commit ce41bca422
17 changed files with 1184 additions and 0 deletions
@@ -0,0 +1,52 @@
+import numpy as np
+import torch
+
+
+class StreamingVAD:
+    """Streaming speech detector built on Silero's ``VADIterator``.
+
+    Audio is fed one chunk at a time through ``process_chunk``. Chunks are
+    buffered from the chunk on which the iterator reports a speech start
+    until it reports a speech end, at which point the buffered audio is
+    handed back as a single utterance.
+
+    Attributes:
+        iterator: The underlying ``silero_vad.VADIterator``.
+        audio_buffer: Copies of the chunks seen since the current speech start.
+        is_speaking: True between a reported start and the matching end.
+    """
+
+    def __init__(
+        self,
+        model,
+        threshold: float = 0.5,
+        min_silence_ms: int = 400,
+        sampling_rate: int = 16000,
+    ):
+        """Create the underlying iterator and start with an empty buffer.
+
+        Args:
+            model: Silero VAD model, passed straight to ``VADIterator``.
+            threshold: Forwarded as the iterator's ``threshold``.
+            min_silence_ms: Forwarded as ``min_silence_duration_ms``.
+            sampling_rate: Sample rate of the incoming audio in Hz, forwarded
+                to the iterator. Defaults to 16000, the value that used to be
+                hard-coded. NOTE(review): Silero VAD documents 8000 and 16000
+                as the supported rates -- confirm for the installed version.
+        """
+        # Imported here rather than at module level, so importing this module
+        # does not itself require silero_vad.
+        from silero_vad import VADIterator
+
+        self.iterator = VADIterator(
+            model,
+            sampling_rate=sampling_rate,
+            threshold=threshold,
+            min_silence_duration_ms=min_silence_ms,
+        )
+        self.audio_buffer: list[np.ndarray] = []
+        self.is_speaking = False
+
+    def process_chunk(self, chunk_16k: np.ndarray) -> np.ndarray | None:
+        """Feed one audio chunk and return a finished utterance, if any.
+
+        Args:
+            chunk_16k: One chunk of samples at the configured sampling rate
+                (512 samples at the default 16 kHz). It is cast to float32
+                without any rescaling before being given to the iterator.
+
+        Returns:
+            The concatenation of the chunks buffered for the utterance when
+            the iterator reports a speech end, otherwise None. The chunk on
+            which the end is reported is not part of the returned audio.
+        """
+        frame = torch.from_numpy(chunk_16k).float()
+        event = self.iterator(frame, return_seconds=False)
+
+        if event and "start" in event:
+            # A new utterance begins: drop anything left from a previous one.
+            self.is_speaking = True
+            self.audio_buffer = []
+
+        if event and "end" in event:
+            utterance = None
+            if self.audio_buffer:
+                utterance = np.concatenate(self.audio_buffer)
+            # Clears the buffer, the speaking flag and the iterator state, so
+            # the next utterance starts from a clean slate.
+            self.reset()
+            return utterance
+
+        if self.is_speaking:
+            # Store a copy so the caller may reuse or overwrite its own array.
+            self.audio_buffer.append(chunk_16k.copy())
+
+        return None
+
+    def reset(self):
+        """Reset VAD state for a new conversation turn.
+
+        Empties the audio buffer, clears the speaking flag and resets the
+        underlying iterator's state.
+        """
+        self.audio_buffer = []
+        self.is_speaking = False
+        self.iterator.reset_states()