add KV-cache and move system prompt to the config
This commit is contained in:
+11
-2
@@ -6,6 +6,7 @@ import threading
|
||||
import numpy as np
|
||||
|
||||
from server.audio_utils import float32_to_pcm_bytes
|
||||
from server.llm import KVCacheState
|
||||
from server.models import ModelManager
|
||||
from server.vad import StreamingVAD
|
||||
|
||||
@@ -27,6 +28,7 @@ class ConversationSession:
|
||||
|
||||
self.vad: StreamingVAD = models.create_vad()
|
||||
self.conversation_history: list[dict] = []
|
||||
self.kv_cache_state: KVCacheState | None = None
|
||||
self.cancel_event = threading.Event()
|
||||
self.is_responding = False
|
||||
self._response_task: asyncio.Task | None = None
|
||||
@@ -38,6 +40,7 @@ class ConversationSession:
|
||||
self.cancel_event.set()
|
||||
if self._response_task and not self._response_task.done():
|
||||
self._response_task.cancel()
|
||||
self.kv_cache_state = None
|
||||
|
||||
async def handle_audio_chunk(self, chunk_16k: np.ndarray):
|
||||
utterance = self.vad.process_chunk(chunk_16k)
|
||||
@@ -91,8 +94,8 @@ class ConversationSession:
|
||||
# LLM
|
||||
log.info(f"Conversation history ({len(self.conversation_history)} messages): "
|
||||
+ str([m['content'][:50] for m in self.conversation_history]))
|
||||
response = await asyncio.to_thread(
|
||||
self.models.llm_engine.generate, self.conversation_history
|
||||
response, self.kv_cache_state = await asyncio.to_thread(
|
||||
self.models.llm_engine.generate, self.conversation_history, 256, self.kv_cache_state
|
||||
)
|
||||
|
||||
if self.cancel_event.is_set():
|
||||
@@ -147,12 +150,18 @@ class ConversationSession:
|
||||
tts_thread.join(timeout=2.0)
|
||||
|
||||
# Save only what was actually spoken
|
||||
was_interrupted = spoken_text.strip() != response.strip()
|
||||
if spoken_text.strip():
|
||||
self.conversation_history.append(
|
||||
{"role": "assistant", "content": spoken_text.strip()}
|
||||
)
|
||||
if was_interrupted and self.kv_cache_state is not None:
|
||||
self.kv_cache_state = self.models.llm_engine.trim_cache(
|
||||
self.kv_cache_state, self.conversation_history
|
||||
)
|
||||
elif self.conversation_history and self.conversation_history[-1]["role"] == "user":
|
||||
self.conversation_history.pop()
|
||||
self.kv_cache_state = None
|
||||
|
||||
if not self.cancel_event.is_set():
|
||||
await self.send_json({"type": "response_text", "text": "", "final": True})
|
||||
|
||||
Reference in New Issue
Block a user