add KV-cache and move system prompt to the config

2026-04-08 10:25:03 -04:00
parent c7c4019ecc
commit 175ed943df
4 changed files with 127 additions and 47 deletions
@@ -6,6 +6,7 @@ import threading
 import numpy as np

 from server.audio_utils import float32_to_pcm_bytes
+from server.llm import KVCacheState
 from server.models import ModelManager
 from server.vad import StreamingVAD

@@ -27,6 +28,7 @@ class ConversationSession:

        self.vad: StreamingVAD = models.create_vad()
        self.conversation_history: list[dict] = []
+        self.kv_cache_state: KVCacheState | None = None
        self.cancel_event = threading.Event()
        self.is_responding = False
        self._response_task: asyncio.Task | None = None
@@ -38,6 +40,7 @@ class ConversationSession:
        self.cancel_event.set()
        if self._response_task and not self._response_task.done():
            self._response_task.cancel()
+        self.kv_cache_state = None

    async def handle_audio_chunk(self, chunk_16k: np.ndarray):
        utterance = self.vad.process_chunk(chunk_16k)
@@ -91,8 +94,8 @@ class ConversationSession:
        # LLM
        log.info(f"Conversation history ({len(self.conversation_history)} messages): "
                 + str([m['content'][:50] for m in self.conversation_history]))
-        response = await asyncio.to_thread(
-            self.models.llm_engine.generate, self.conversation_history
+        response, self.kv_cache_state = await asyncio.to_thread(
+            self.models.llm_engine.generate, self.conversation_history, 256, self.kv_cache_state
        )

        if self.cancel_event.is_set():
@@ -147,12 +150,18 @@ class ConversationSession:
        tts_thread.join(timeout=2.0)

        # Save only what was actually spoken
+        was_interrupted = spoken_text.strip() != response.strip()
        if spoken_text.strip():
            self.conversation_history.append(
                {"role": "assistant", "content": spoken_text.strip()}
            )
+            if was_interrupted and self.kv_cache_state is not None:
+                self.kv_cache_state = self.models.llm_engine.trim_cache(
+                    self.kv_cache_state, self.conversation_history
+                )
        elif self.conversation_history and self.conversation_history[-1]["role"] == "user":
            self.conversation_history.pop()
+            self.kv_cache_state = None

        if not self.cancel_event.is_set():
            await self.send_json({"type": "response_text", "text": "", "final": True})