add KV-cache and move system prompt to the config

This commit is contained in:
2026-04-08 10:25:03 -04:00
parent c7c4019ecc
commit 175ed943df
4 changed files with 127 additions and 47 deletions
+7 -4
View File
@@ -68,16 +68,18 @@ class ModelManager:
def _load_llm(self):
from server.config import config
backend = config.get("llm", {}).get("backend", "local")
llm_config = config.get("llm", {})
backend = llm_config.get("backend", "local")
system_prompt = llm_config.get("system_prompt", "You are a helpful assistant.")
if backend == "lmstudio":
from server.llm import LMStudioEngine
lms = config.get("llm", {}).get("lmstudio", {})
lms = llm_config.get("lmstudio", {})
url = lms.get("url", "http://host.docker.internal:1234")
model = lms.get("model", "") or ""
log.info(f"Using LM Studio backend at {url} (model={model or 'server default'})")
self.llm_engine = LMStudioEngine(url, model)
self.llm_engine = LMStudioEngine(url, model, system_prompt)
else:
log.info("Loading Qwen3-4B (GPTQ 4-bit)...")
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -90,7 +92,8 @@ class ModelManager:
model_name,
device_map=device,
)
self.llm_engine = LLMEngine(model, tokenizer)
max_cache_tokens = llm_config.get("max_cache_tokens", 4096)
self.llm_engine = LLMEngine(model, tokenizer, system_prompt, max_cache_tokens)
log.info("Qwen3-4B-GPTQ-Int4 loaded (~2.5GB VRAM).")
def _load_tts(self):