add KV-cache and move system prompt to the config
This commit is contained in:
+7
-4
@@ -68,16 +68,18 @@ class ModelManager:
|
||||
def _load_llm(self):
|
||||
from server.config import config
|
||||
|
||||
backend = config.get("llm", {}).get("backend", "local")
|
||||
llm_config = config.get("llm", {})
|
||||
backend = llm_config.get("backend", "local")
|
||||
system_prompt = llm_config.get("system_prompt", "You are a helpful assistant.")
|
||||
|
||||
if backend == "lmstudio":
|
||||
from server.llm import LMStudioEngine
|
||||
|
||||
lms = config.get("llm", {}).get("lmstudio", {})
|
||||
lms = llm_config.get("lmstudio", {})
|
||||
url = lms.get("url", "http://host.docker.internal:1234")
|
||||
model = lms.get("model", "") or ""
|
||||
log.info(f"Using LM Studio backend at {url} (model={model or 'server default'})")
|
||||
self.llm_engine = LMStudioEngine(url, model)
|
||||
self.llm_engine = LMStudioEngine(url, model, system_prompt)
|
||||
else:
|
||||
log.info("Loading Qwen3-4B (GPTQ 4-bit)...")
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
@@ -90,7 +92,8 @@ class ModelManager:
|
||||
model_name,
|
||||
device_map=device,
|
||||
)
|
||||
self.llm_engine = LLMEngine(model, tokenizer)
|
||||
max_cache_tokens = llm_config.get("max_cache_tokens", 4096)
|
||||
self.llm_engine = LLMEngine(model, tokenizer, system_prompt, max_cache_tokens)
|
||||
log.info("Qwen3-4B-GPTQ-Int4 loaded (~2.5GB VRAM).")
|
||||
|
||||
def _load_tts(self):
|
||||
|
||||
Reference in New Issue
Block a user