Filter cached vision action windows
This commit is contained in:
@@ -908,7 +908,7 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
||||
Confirmed and provisional matches both stay subject to the same thresholds
|
||||
used elsewhere; this only adds matches that pass the same quality gates.
|
||||
"""
|
||||
if not cfg.vision.enabled or not beats:
|
||||
if not beats:
|
||||
return results
|
||||
|
||||
from dataclasses import replace
|
||||
@@ -977,6 +977,79 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
||||
|
||||
scenes_by_id = {s.scene_id: s for s in scenes}
|
||||
best = None # (score, scene, in_s, dur_s, reason)
|
||||
try:
|
||||
from src.llm.vision_cache import (
|
||||
_load_cache,
|
||||
_semantic_action_groups,
|
||||
_semantic_match_score,
|
||||
_STRONG_ACTION_GROUPS,
|
||||
)
|
||||
cache = _load_cache(cfg)
|
||||
items = cache.get("items", {})
|
||||
beat_desc = ""
|
||||
if isinstance(items, dict):
|
||||
for item in items.values():
|
||||
if (
|
||||
isinstance(item, dict)
|
||||
and item.get("kind") == "beat"
|
||||
and item.get("item_id") == beat.beat_id
|
||||
):
|
||||
beat_desc = str(item.get("description", ""))
|
||||
break
|
||||
beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS if beat_desc else set()
|
||||
identity_vocab = {
|
||||
"woman", "women", "man", "men", "girl", "boy", "child",
|
||||
"blonde", "hair", "face", "mouth", "eyes", "profile",
|
||||
"close-up", "closeup",
|
||||
}
|
||||
beat_identity = {term for term in identity_vocab if term in beat_desc.lower()}
|
||||
distinctive_identity = {
|
||||
term for term in ("woman", "women", "blonde", "mouth", "face")
|
||||
if term in beat_desc.lower()
|
||||
}
|
||||
if beat_actions and isinstance(items, dict):
|
||||
for item in items.values():
|
||||
if not isinstance(item, dict) or item.get("kind") != "action_window":
|
||||
continue
|
||||
scene = scenes_by_id.get(item.get("item_id"))
|
||||
desc = str(item.get("description", ""))
|
||||
source_actions = _semantic_action_groups(desc)
|
||||
if scene is None or not beat_actions <= source_actions:
|
||||
continue
|
||||
source_text = desc.lower()
|
||||
positive_source_text = source_text.split('"negatives"', 1)[0]
|
||||
identity_overlap = {term for term in beat_identity if term in source_text}
|
||||
if len(beat_identity) >= 2 and len(identity_overlap) < 2:
|
||||
continue
|
||||
if distinctive_identity and not any(term in positive_source_text for term in distinctive_identity):
|
||||
continue
|
||||
if "mouth" in beat_desc.lower() and "mouth" not in positive_source_text:
|
||||
continue
|
||||
if "dark interior" in beat_desc.lower() and (
|
||||
"interior" not in positive_source_text or "dark" not in positive_source_text
|
||||
):
|
||||
continue
|
||||
score, reason = _semantic_match_score(beat_desc, desc)
|
||||
if score < max(0.60, cfg.cv.deep_scan.provisional_match_threshold):
|
||||
continue
|
||||
try:
|
||||
in_s = float(item.get("start_s"))
|
||||
out_s = float(item.get("end_s"))
|
||||
except (TypeError, ValueError):
|
||||
continue
|
||||
duration_s = max(0.32, min(anchor_beat.duration_s, out_s - in_s))
|
||||
candidate = (
|
||||
min(0.99, score),
|
||||
scene,
|
||||
in_s,
|
||||
duration_s,
|
||||
f"cached vision action; {reason}",
|
||||
)
|
||||
if best is None or candidate[0] > best[0]:
|
||||
best = candidate
|
||||
except Exception as exc:
|
||||
logger.debug("Beat %d: cached vision fallback failed (%s)", beat.beat_id, exc)
|
||||
|
||||
seen = set()
|
||||
for hit in hits[: cfg.cv.deep_scan.scene_seed_top_k]:
|
||||
scene = scenes_by_id.get(hit.scene_id)
|
||||
@@ -1003,7 +1076,10 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("Beat %d: align failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc)
|
||||
continue
|
||||
aligned_in_s = start_s
|
||||
combined_score = semantic_score
|
||||
content_score = 0.0
|
||||
motion_score = 0.0
|
||||
aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - anchor_beat.duration_s)))
|
||||
|
||||
try:
|
||||
@@ -1033,6 +1109,8 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
|
||||
combined_score,
|
||||
min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
|
||||
)
|
||||
if semantic_score >= max(0.60, cfg.cv.deep_scan.provisional_match_threshold):
|
||||
final_score = max(final_score, semantic_score)
|
||||
if final_score < cfg.cv.deep_scan.provisional_match_threshold:
|
||||
continue
|
||||
candidate = (final_score, scene, aligned_in_s, usable_duration_s, f"recovery; {reason}; {verify_reason}")
|
||||
|
||||
Reference in New Issue
Block a user