Improve video handling and add performance optimizations

- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
This commit is contained in:
2026-04-24 00:36:18 -04:00
parent 129df7d1fa
commit 44a10667c2
7 changed files with 234 additions and 69 deletions
+29 -3
View File
@@ -80,11 +80,28 @@ class LLMEngine:
f"processing {input_len - cached_len} new tokens"
)
with torch.no_grad():
outputs = self.model.generate(
# Guard: if the cache claims to have seen >= input tokens, it's
# stale (can happen after barge-in races or tokenizer mismatches).
# An invalid cache causes an empty cache_position in transformers,
# which raises IndexError inside model.generate().
if past_kv is not None:
cache_seq_len = (
past_kv.get_seq_length()
if hasattr(past_kv, "get_seq_length")
else cached_len
)
if cache_seq_len >= input_len:
log.warning(
f"KV-cache stale (cache_seq={cache_seq_len} >= input={input_len}), discarding."
)
past_kv = None
cached_len = 0
def _do_generate(pkv):
return self.model.generate(
input_ids=input_ids,
attention_mask=inputs.get("attention_mask"),
past_key_values=past_kv,
past_key_values=pkv,
max_new_tokens=max_new_tokens,
temperature=0.7,
top_p=0.9,
@@ -94,6 +111,15 @@ class LLMEngine:
use_cache=True,
)
with torch.no_grad():
try:
outputs = _do_generate(past_kv)
except IndexError:
log.warning("KV-cache caused IndexError during generate; retrying without cache.")
past_kv = None
cached_len = 0
outputs = _do_generate(None)
# Decode only the generated tokens (skip prompt)
new_ids = outputs.sequences[0][input_len:]
response = self.tokenizer.decode(new_ids, skip_special_tokens=True).strip()