Add LM Studio option

This commit is contained in:
2026-04-08 10:17:20 -04:00
parent 08c5757b31
commit c7c4019ecc
6 changed files with 112 additions and 13 deletions
+8
View File
@@ -0,0 +1,8 @@
# LLM backend: "local" or "lmstudio"
llm:
backend: local # change to "lmstudio" to use LM Studio instead
# Settings used only when backend = "lmstudio"
lmstudio:
url: http://host.docker.internal:1234 # host.docker.internal resolves to the Docker host machine from inside the container (on Linux this requires an extra_hosts entry mapping it to host-gateway)
model: "" # leave empty to use whatever model LM Studio has loaded
+2
View File
@@ -6,6 +6,8 @@ services:
volumes: volumes:
# Cache models on the host so they survive container rebuilds # Cache models on the host so they survive container rebuilds
- huggingface-cache:/cache/huggingface - huggingface-cache:/cache/huggingface
# Mount config so you can edit backend settings without rebuilding the image
- ./config.yml:/app/config.yml:ro
deploy: deploy:
resources: resources:
reservations: reservations:
+1
View File
@@ -13,3 +13,4 @@ numpy
soundfile soundfile
scipy scipy
python-multipart python-multipart
pyyaml
+12
View File
@@ -0,0 +1,12 @@
import pathlib
import yaml
# config.yml sits two directory levels above this file (the parent of the
# directory that contains this module).
_CONFIG_PATH = pathlib.Path(__file__).parent.parent / "config.yml"


def load_config() -> dict:
    """Read ``config.yml`` and return its contents as a dictionary.

    An empty file (which YAML parses to ``None``) yields an empty dict, so
    callers can chain ``.get(...)`` lookups and fall back to their defaults
    instead of crashing on ``None``.

    Returns:
        The parsed top-level mapping, or ``{}`` when the file is empty.

    Raises:
        FileNotFoundError: If the config file does not exist.
        yaml.YAMLError: If the file is not valid YAML.
        ValueError: If the top-level YAML value is not a mapping.
    """
    with open(_CONFIG_PATH, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    if data is None:
        return {}
    if not isinstance(data, dict):
        raise ValueError(
            f"{_CONFIG_PATH} must contain a YAML mapping at the top level, "
            f"got {type(data).__name__}"
        )
    return data


# Parsed once, when this module is first imported.
config = load_config()
+63
View File
@@ -81,3 +81,66 @@ class LLMEngine:
if remainder: if remainder:
yield remainder yield remainder
# System message prepended to every chat request; one sentence per literal.
SYSTEM_PROMPT = (
    "You are a helpful voice assistant. "
    "Keep your responses concise and natural for spoken conversation. "
    "Respond in 1-3 short sentences. "
    "Do not use markdown, bullet points, code blocks, emojis, "
    "or any formatting that doesn't work in speech."
)
class LMStudioEngine:
"""LLM engine that delegates to an LM Studio server via its OpenAI-compatible API."""
def __init__(self, base_url: str, model: str):
self.base_url = base_url.rstrip("/")
self.model = model
def generate(self, messages: list[dict], max_new_tokens: int = 256) -> str:
import requests
payload_messages = [{"role": "system", "content": SYSTEM_PROMPT}]
payload_messages.extend(messages)
body: dict = {
"messages": payload_messages,
"max_tokens": max_new_tokens,
"temperature": 0.7,
"stream": False,
}
if self.model:
body["model"] = self.model
resp = requests.post(
f"{self.base_url}/v1/chat/completions",
json=body,
timeout=30,
)
resp.raise_for_status()
response = resp.json()["choices"][0]["message"]["content"].strip()
log.info(f"LM Studio response: {response}")
return response
async def generate_sentences(
self,
messages: list[dict],
cancel_event: threading.Event | None = None,
) -> AsyncIterator[str]:
"""Generate response and yield it sentence by sentence for TTS pipelining."""
import asyncio
response = await asyncio.to_thread(self.generate, messages)
if cancel_event and cancel_event.is_set():
return
sentences, remainder = split_sentences(response)
for sentence in sentences:
if cancel_event and cancel_event.is_set():
return
yield sentence
if remainder:
yield remainder
+14 -1
View File
@@ -46,7 +46,7 @@ class ModelManager:
from server.vad import SileroVADOnnx from server.vad import SileroVADOnnx
model_path = hf_hub_download( model_path = hf_hub_download(
repo_id="onnx-community/silero-vad", filename="silero_vad.onnx" repo_id="onnx-community/silero-vad", filename="onnx/model.onnx"
) )
self.vad_model = SileroVADOnnx(model_path) self.vad_model = SileroVADOnnx(model_path)
log.info("Silero VAD loaded (ONNX, CPU).") log.info("Silero VAD loaded (ONNX, CPU).")
@@ -66,6 +66,19 @@ class ModelManager:
log.info("Qwen3-ASR-0.6B loaded.") log.info("Qwen3-ASR-0.6B loaded.")
def _load_llm(self): def _load_llm(self):
from server.config import config
backend = config.get("llm", {}).get("backend", "local")
if backend == "lmstudio":
from server.llm import LMStudioEngine
lms = config.get("llm", {}).get("lmstudio", {})
url = lms.get("url", "http://host.docker.internal:1234")
model = lms.get("model", "") or ""
log.info(f"Using LM Studio backend at {url} (model={model or 'server default'})")
self.llm_engine = LMStudioEngine(url, model)
else:
log.info("Loading Qwen3-4B (GPTQ 4-bit)...") log.info("Loading Qwen3-4B (GPTQ 4-bit)...")
from transformers import AutoModelForCausalLM, AutoTokenizer from transformers import AutoModelForCausalLM, AutoTokenizer