Enhance video handling and performance optimizations

- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
2026-04-24 00:36:18 -04:00
parent 129df7d1fa
commit 44a10667c2
7 changed files with 234 additions and 69 deletions
@@ -80,11 +80,28 @@ class LLMEngine:
                f"processing {input_len - cached_len} new tokens"
            )

-            with torch.no_grad():
-                outputs = self.model.generate(
+            # Guard: if the cache claims to have seen >= input tokens, it's
+            # stale (can happen after barge-in races or tokenizer mismatches).
+            # An invalid cache causes an empty cache_position in transformers,
+            # which raises IndexError inside model.generate().
+            if past_kv is not None:
+                cache_seq_len = (
+                    past_kv.get_seq_length()
+                    if hasattr(past_kv, "get_seq_length")
+                    else cached_len
+                )
+                if cache_seq_len >= input_len:
+                    log.warning(
+                        f"KV-cache stale (cache_seq={cache_seq_len} >= input={input_len}), discarding."
+                    )
+                    past_kv = None
+                    cached_len = 0
+
+            def _do_generate(pkv):
+                return self.model.generate(
                    input_ids=input_ids,
                    attention_mask=inputs.get("attention_mask"),
-                    past_key_values=past_kv,
+                    past_key_values=pkv,
                    max_new_tokens=max_new_tokens,
                    temperature=0.7,
                    top_p=0.9,
@@ -94,6 +111,15 @@ class LLMEngine:
                    use_cache=True,
                )

+            with torch.no_grad():
+                try:
+                    outputs = _do_generate(past_kv)
+                except IndexError:
+                    log.warning("KV-cache caused IndexError during generate; retrying without cache.")
+                    past_kv = None
+                    cached_len = 0
+                    outputs = _do_generate(None)
+
            # Decode only the generated tokens (skip prompt)
            new_ids = outputs.sequences[0][input_len:]
            response = self.tokenizer.decode(new_ids, skip_special_tokens=True).strip()