Files
live-voice-chat/server/audio_utils.py
T
2026-04-07 03:58:35 -04:00

64 lines
2.0 KiB
Python

import numpy as np
from scipy.signal import resample_poly
from math import gcd
def pcm_bytes_to_float32(pcm_bytes: bytes, dtype=np.int16) -> np.ndarray:
    """Convert raw PCM bytes to float32 samples normalized to [-1, 1].

    Args:
        pcm_bytes: Raw sample data. Its length must be a multiple of the
            sample size of ``dtype``; otherwise ``np.frombuffer`` raises
            ``ValueError``.
        dtype: Sample format of ``pcm_bytes``. Defaults to 16-bit signed
            integers.

    Returns:
        A float32 array with one entry per sample. Signed integer formats
        are divided by their full-scale magnitude (32768 for int16),
        unsigned integer formats are centered on their midpoint and then
        scaled, and any other dtype (e.g. float32 PCM) is only cast.
    """
    sample_dtype = np.dtype(dtype)
    samples = np.frombuffer(pcm_bytes, dtype=sample_dtype)

    if sample_dtype.kind == "i":
        # Scale by the magnitude of the most negative value so the result
        # follows the requested sample width instead of assuming 16 bits.
        full_scale = -float(np.iinfo(sample_dtype).min)
        return samples.astype(np.float32) / np.float32(full_scale)

    if sample_dtype.kind == "u":
        # Unsigned PCM is offset-binary: the midpoint represents silence.
        midpoint = float(np.iinfo(sample_dtype).max // 2 + 1)
        centered = (samples.astype(np.float64) - midpoint) / midpoint
        return centered.astype(np.float32)

    # Non-integer formats carry no integer full-scale to divide by.
    return samples.astype(np.float32)
def float32_to_pcm_bytes(audio) -> bytes:
    """Convert float audio in [-1, 1] to 16-bit signed PCM bytes.

    Args:
        audio: A numpy array, a tensor-like object exposing
            ``detach().cpu().numpy()`` (e.g. a PyTorch tensor), or any
            sequence that ``np.asarray`` can convert.

    Returns:
        The samples as native-endian int16 bytes. Values outside [-1, 1]
        are clamped, NaN samples become silence (0), and scaled values are
        truncated toward zero by the integer cast.
    """
    if isinstance(audio, np.ndarray):
        samples = audio
    elif hasattr(audio, "detach"):
        # Tensor-like input: drop autograd tracking and move to host memory.
        samples = audio.detach().cpu().numpy()
    else:
        # Plain Python sequences (lists, tuples) are accepted as well.
        samples = np.asarray(audio, dtype=np.float32)

    # Casting NaN to an integer type is undefined, so map it to silence
    # first; infinities are handled by the clamp below.
    samples = np.nan_to_num(samples, nan=0.0)
    clamped = np.clip(samples, -1.0, 1.0)
    return (clamped * 32767).astype(np.int16).tobytes()
def resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
"""Resample audio from orig_sr to target_sr using polyphase filtering."""
if orig_sr == target_sr:
return audio
divisor = gcd(orig_sr, target_sr)
up = target_sr // divisor
down = orig_sr // divisor
return resample_poly(audio, up, down).astype(audio.dtype)
def split_sentences(text: str) -> tuple[list[str], str]:
    """Split text into completed sentences and a remaining buffer.

    A sentence ends at a run of terminators (``.``, ``!``, ``?``) that is
    followed by any whitespace character or by the end of the text.
    Punctuation followed directly by other characters (as in ``3.14``)
    does not end a sentence.

    Args:
        text: The text accumulated so far.

    Returns:
        ``(sentences, remaining_buffer)``: the completed sentences with
        surrounding whitespace stripped, and the stripped trailing text
        that has not yet reached a sentence boundary.
    """
    terminators = ".!?"
    sentences: list[str] = []
    length = len(text)
    start = 0
    i = 0

    while i < length:
        if text[i] not in terminators:
            i += 1
            continue

        # Consume the whole punctuation run so "..." or "?!" stays attached
        # to the sentence it ends.
        end = i + 1
        while end < length and text[end] in terminators:
            end += 1

        if end < length and not text[end].isspace():
            # Punctuation inside a token, e.g. a decimal number; skip the
            # run in one step since every position in it ends the same way.
            i = end
            continue

        sentence = text[start:end].strip()
        if sentence:
            sentences.append(sentence)
        start = end
        i = end

    remaining = text[start:].strip()
    return sentences, remaining