Files
live-voice-chat/server/llm.py
T
2026-04-08 10:17:20 -04:00

147 lines
4.7 KiB
Python

import logging
import threading
from typing import AsyncIterator
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from server.audio_utils import split_sentences
log = logging.getLogger(__name__)
class LLMEngine:
    """Wraps Qwen3 for conversation generation.

    Holds an already-loaded causal language model and its tokenizer and
    exposes a blocking ``generate`` plus an async ``generate_sentences``
    that hands the reply out one sentence at a time.
    """

    SYSTEM_PROMPT = (
        "You are a helpful voice assistant. Keep your responses concise and natural "
        "for spoken conversation. Respond in 1-3 short sentences. "
        "Do not use markdown, bullet points, code blocks, emojis, or any "
        "formatting that doesn't work in speech."
    )

    def __init__(self, model: AutoModelForCausalLM, tokenizer: AutoTokenizer):
        """Store the model and tokenizer used for every generation call."""
        self.model = model
        self.tokenizer = tokenizer

    def _build_inputs(self, messages: list[dict]):
        """Build input token ids using the model's chat template.

        Args:
            messages: Conversation turns; each dict must provide ``"role"``
                and ``"content"``. Any other keys are dropped.

        Returns:
            The tokenizer output (PyTorch tensors) moved to ``self.model.device``.
        """
        # The system prompt always comes first; only role/content are forwarded.
        chat_messages = [{"role": "system", "content": self.SYSTEM_PROMPT}]
        chat_messages.extend(
            {"role": msg["role"], "content": msg["content"]} for msg in messages
        )
        # enable_thinking is forwarded to the chat template as-is
        # (presumably Qwen3's reasoning-mode switch -- confirm against the model card).
        text = self.tokenizer.apply_chat_template(
            chat_messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=False,
        )
        return self.tokenizer(text, return_tensors="pt").to(self.model.device)

    def generate(self, messages: list[dict], max_new_tokens: int = 256) -> str:
        """Generate a complete response (blocking).

        Args:
            messages: Conversation turns with ``"role"`` and ``"content"`` keys.
            max_new_tokens: Upper bound on the number of generated tokens.

        Returns:
            The decoded reply with special tokens removed and surrounding
            whitespace stripped.
        """
        inputs = self._build_inputs(messages)
        input_len = inputs["input_ids"].shape[1]
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                repetition_penalty=1.2,
            )
        # Decode only the generated tokens (skip prompt)
        new_ids = output_ids[0][input_len:]
        response = self.tokenizer.decode(new_ids, skip_special_tokens=True).strip()
        # Lazy %-formatting: the message is only built if INFO is enabled.
        log.info("LLM response: %s", response)
        return response

    async def generate_sentences(
        self,
        messages: list[dict],
        cancel_event: threading.Event | None = None,
    ) -> AsyncIterator[str]:
        """Generate response and yield it sentence by sentence for TTS pipelining.

        The full reply is produced first (in a worker thread, so the event
        loop is not blocked) and only then split; ``cancel_event`` does not
        interrupt the generation itself.

        Args:
            messages: Conversation turns with ``"role"`` and ``"content"`` keys.
            cancel_event: Optional event; once set, nothing further is yielded.

        Yields:
            Each sentence from ``split_sentences``, followed by the trailing
            remainder if it is non-empty.
        """
        import asyncio

        def cancelled() -> bool:
            return cancel_event is not None and cancel_event.is_set()

        response = await asyncio.to_thread(self.generate, messages)
        if cancelled():
            return

        # Split into sentences and yield each
        sentences, remainder = split_sentences(response)
        for sentence in sentences:
            if cancelled():
                return
            yield sentence

        # The remainder is subject to cancellation like any other sentence.
        if remainder and not cancelled():
            yield remainder
# System prompt that LMStudioEngine.generate sends as the first chat message.
# NOTE: the text is identical to LLMEngine.SYSTEM_PROMPT; keep the two in sync.
SYSTEM_PROMPT = " ".join(
    (
        "You are a helpful voice assistant.",
        "Keep your responses concise and natural for spoken conversation.",
        "Respond in 1-3 short sentences.",
        "Do not use markdown, bullet points, code blocks, emojis, or any "
        "formatting that doesn't work in speech.",
    )
)
class LMStudioEngine:
"""LLM engine that delegates to an LM Studio server via its OpenAI-compatible API."""
def __init__(self, base_url: str, model: str):
self.base_url = base_url.rstrip("/")
self.model = model
def generate(self, messages: list[dict], max_new_tokens: int = 256) -> str:
import requests
payload_messages = [{"role": "system", "content": SYSTEM_PROMPT}]
payload_messages.extend(messages)
body: dict = {
"messages": payload_messages,
"max_tokens": max_new_tokens,
"temperature": 0.7,
"stream": False,
}
if self.model:
body["model"] = self.model
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=body,
timeout=30,
)
resp.raise_for_status()
response = resp.json()["choices"][0]["message"]["content"].strip()
log.info(f"LM Studio response: {response}")
return response
async def generate_sentences(
self,
messages: list[dict],
cancel_event: threading.Event | None = None,
) -> AsyncIterator[str]:
"""Generate response and yield it sentence by sentence for TTS pipelining."""
import asyncio
response = await asyncio.to_thread(self.generate, messages)
if cancel_event and cancel_event.is_set():
return
sentences, remainder = split_sentences(response)
for sentence in sentences:
if cancel_event and cancel_event.is_set():
return
yield sentence
if remainder:
yield remainder