Enhance video handling and performance optimizations
- Added environment variables to prevent CPU thread pools from busy-waiting.
- Deferred loading of video models until first use to reduce VRAM footprint.
- Implemented streaming of speaking clips for improved responsiveness.
- Introduced a queue for managing speaking clips to handle multiple requests smoothly.
- Updated video playback logic to ensure proper handling of clip generation.
This commit is contained in:
+29
-3
@@ -80,11 +80,28 @@ class LLMEngine:
|
||||
f"processing {input_len - cached_len} new tokens"
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = self.model.generate(
|
||||
# Guard: if the cache claims to have seen >= input tokens, it's
|
||||
# stale (can happen after barge-in races or tokenizer mismatches).
|
||||
# An invalid cache causes an empty cache_position in transformers,
|
||||
# which raises IndexError inside model.generate().
|
||||
if past_kv is not None:
|
||||
cache_seq_len = (
|
||||
past_kv.get_seq_length()
|
||||
if hasattr(past_kv, "get_seq_length")
|
||||
else cached_len
|
||||
)
|
||||
if cache_seq_len >= input_len:
|
||||
log.warning(
|
||||
f"KV-cache stale (cache_seq={cache_seq_len} >= input={input_len}), discarding."
|
||||
)
|
||||
past_kv = None
|
||||
cached_len = 0
|
||||
|
||||
def _do_generate(pkv):
|
||||
return self.model.generate(
|
||||
input_ids=input_ids,
|
||||
attention_mask=inputs.get("attention_mask"),
|
||||
past_key_values=past_kv,
|
||||
past_key_values=pkv,
|
||||
max_new_tokens=max_new_tokens,
|
||||
temperature=0.7,
|
||||
top_p=0.9,
|
||||
@@ -94,6 +111,15 @@ class LLMEngine:
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
with torch.no_grad():
|
||||
try:
|
||||
outputs = _do_generate(past_kv)
|
||||
except IndexError:
|
||||
log.warning("KV-cache caused IndexError during generate; retrying without cache.")
|
||||
past_kv = None
|
||||
cached_len = 0
|
||||
outputs = _do_generate(None)
|
||||
|
||||
# Decode only the generated tokens (skip prompt)
|
||||
new_ids = outputs.sequences[0][input_len:]
|
||||
response = self.tokenizer.decode(new_ids, skip_special_tokens=True).strip()
|
||||
|
||||
Reference in New Issue
Block a user