From e966a4c3217cdec4c0ea4693b1b09d2b07a8f7aa Mon Sep 17 00:00:00 2001
From: Melbar <tangshode@gmail.com>
Date: Sat, 9 May 2026 18:30:13 +0200
Subject: [PATCH] Filter cached vision action windows

---
 cli.py                  |  82 +++++++++++++++++++++-
 docs/ALGORITHM.md       |  26 +++++++
 src/llm/vision_cache.py | 146 ++++++++++++++++++++++++++++++++--------
 3 files changed, 224 insertions(+), 30 deletions(-)

diff --git a/cli.py b/cli.py
index 3c6e071..b6d5a3b 100644
--- a/cli.py
+++ b/cli.py
@@ -908,7 +908,7 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
     Confirmed and provisional matches both stay subject to the same thresholds
     used elsewhere; this only adds matches that pass the same quality gates.
     """
-    if not cfg.vision.enabled or not beats:
+    if not beats:
         return results
 
     from dataclasses import replace
@@ -977,6 +977,79 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
 
         scenes_by_id = {s.scene_id: s for s in scenes}
         best = None  # (score, scene, in_s, dur_s, reason)
+        try:
+            from src.llm.vision_cache import (
+                _load_cache,
+                _semantic_action_groups,
+                _semantic_match_score,
+                _STRONG_ACTION_GROUPS,
+            )
+            cache = _load_cache(cfg)
+            items = cache.get("items", {})
+            beat_desc = ""
+            if isinstance(items, dict):
+                for item in items.values():
+                    if (
+                        isinstance(item, dict)
+                        and item.get("kind") == "beat"
+                        and item.get("item_id") == beat.beat_id
+                    ):
+                        beat_desc = str(item.get("description", ""))
+                        break
+            beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS if beat_desc else set()
+            identity_vocab = {
+                "woman", "women", "man", "men", "girl", "boy", "child",
+                "blonde", "hair", "face", "mouth", "eyes", "profile",
+                "close-up", "closeup",
+            }
+            beat_identity = {term for term in identity_vocab if term in beat_desc.lower()}
+            distinctive_identity = {
+                term for term in ("woman", "women", "blonde", "mouth", "face")
+                if term in beat_desc.lower()
+            }
+            if beat_actions and isinstance(items, dict):
+                for item in items.values():
+                    if not isinstance(item, dict) or item.get("kind") != "action_window":
+                        continue
+                    scene = scenes_by_id.get(item.get("item_id"))
+                    desc = str(item.get("description", ""))
+                    source_actions = _semantic_action_groups(desc)
+                    if scene is None or not beat_actions <= source_actions:
+                        continue
+                    source_text = desc.lower()
+                    positive_source_text = source_text.split('"negatives"', 1)[0]
+                    identity_overlap = {term for term in beat_identity if term in source_text}
+                    if len(beat_identity) >= 2 and len(identity_overlap) < 2:
+                        continue
+                    if distinctive_identity and not any(term in positive_source_text for term in distinctive_identity):
+                        continue
+                    if "mouth" in beat_desc.lower() and "mouth" not in positive_source_text:
+                        continue
+                    if "dark interior" in beat_desc.lower() and (
+                        "interior" not in positive_source_text or "dark" not in positive_source_text
+                    ):
+                        continue
+                    score, reason = _semantic_match_score(beat_desc, desc)
+                    if score < max(0.60, cfg.cv.deep_scan.provisional_match_threshold):
+                        continue
+                    try:
+                        in_s = float(item.get("start_s"))
+                        out_s = float(item.get("end_s"))
+                    except (TypeError, ValueError):
+                        continue
+                    duration_s = max(0.32, min(anchor_beat.duration_s, out_s - in_s))
+                    candidate = (
+                        min(0.99, score),
+                        scene,
+                        in_s,
+                        duration_s,
+                        f"cached vision action; {reason}",
+                    )
+                    if best is None or candidate[0] > best[0]:
+                        best = candidate
+        except Exception as exc:
+            logger.debug("Beat %d: cached vision fallback failed (%s)", beat.beat_id, exc)
+
         seen = set()
         for hit in hits[: cfg.cv.deep_scan.scene_seed_top_k]:
             scene = scenes_by_id.get(hit.scene_id)
@@ -1003,7 +1076,10 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
                 )
             except Exception as exc:
                 logger.debug("Beat %d: align failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc)
-                continue
+                aligned_in_s = start_s
+                combined_score = semantic_score
+                content_score = 0.0
+                motion_score = 0.0
             aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - anchor_beat.duration_s)))
 
             try:
@@ -1033,6 +1109,8 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list
                 combined_score,
                 min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
             )
+            if semantic_score >= max(0.60, cfg.cv.deep_scan.provisional_match_threshold):
+                final_score = max(final_score, semantic_score)
             if final_score < cfg.cv.deep_scan.provisional_match_threshold:
                 continue
             candidate = (final_score, scene, aligned_in_s, usable_duration_s, f"recovery; {reason}; {verify_reason}")
diff --git a/docs/ALGORITHM.md b/docs/ALGORITHM.md
index 5f6c0e6..69555e1 100644
--- a/docs/ALGORITHM.md
+++ b/docs/ALGORITHM.md
@@ -194,6 +194,32 @@ Die Vision-Recovery läuft nicht nur für komplett fehlende Beats, sondern auch
 für schwache unbestätigte Treffer. Gerade Low-Light-Beats dürfen nicht an einem
 falschen dunklen CV-Treffer hängen bleiben, wenn der Cache semantisch eine
 bessere Handlungsphase kennt.
+Bei langen Source-Szenen prüft die Action-Window-Suche immer den Szenenanfang
+und mehrere frühe Fenster, bevor sie gleichmäßig über die ganze Szene sampelt.
+Damit gehen kurze Trailer-Aktionen am Anfang einer langen Szene nicht unter,
+wenn der Rest der Szene aus Credits, Schwarzbild oder ruhigen Folgeframes
+besteht.
+Wenn ein Action-Window die starke Beat-Aktion explizit enthält, darf es eine
+etwas niedrigere Textähnlichkeit haben; die Handlung zählt dann stärker als
+Nebenwörter zu Licht, Bildausschnitt oder Stimmung.
+Bereits gecachte Action-Windows einer Szene bleiben gültige Kandidaten, auch
+wenn sich das aktuelle Sampling-Raster ändert. So verliert der Matcher keine
+teuren Vision-Hinweise und muss dieselben Fenster nicht erneut beschreiben.
+Wenn neue Vision-Calls deaktiviert sind, darf die Recovery vorhandene
+Cache-Beschreibungen trotzdem lesen; das erzeugt keine API-Kosten und
+verhindert, dass alte schwache CV-Treffer stehen bleiben.
+Schlägt die CV-Feinjustierung bei einem semantisch klaren Low-Light-Fenster
+fehl, bleibt das Action-Window als provisorischer Treffer erhalten. CV darf
+einen dunklen Treffer verfeinern, aber nicht einen eindeutigen Cache-Hinweis
+komplett verwerfen.
+Zusätzlich kann Recovery vorhandene gecachte Action-Windows direkt über alle
+Szenen ranken. Dieser schnelle Pfad vermeidet einen teuren Vollscan, wenn der
+Cache bereits eine starke Aktion wie Hand-am-Mund, Kuss oder Blickwechsel
+enthält.
+Eindeutige Begriffe aus der Beat-Beschreibung wirken als harte Filter für
+Vision-Fenster: `mouth` muss im Kandidaten wiederkehren, `dark interior` darf
+nicht auf Outdoor-Material fallen, und markante Personenmerkmale wie `blonde`
+bleiben bindend.
 
 Der zusätzliche Hi-Res-Phasenrefine bleibt lokal um den bereits validierten
 Inpoint und übernimmt nur klare Verbesserungen. Er darf keine ganze lange
diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py
index 6752c65..6355f3e 100644
--- a/src/llm/vision_cache.py
+++ b/src/llm/vision_cache.py
@@ -434,12 +434,20 @@ def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> l
 
     usable_start = scene.start_s
     usable_end = max(scene.start_s, scene.end_s - window_s)
-    if max_windows == 1:
-        starts = [usable_start + (usable_end - usable_start) * 0.5]
-    else:
-        step = (usable_end - usable_start) / max(1, max_windows - 1)
-        starts = [usable_start + step * idx for idx in range(max_windows)]
-    return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in starts]
+    starts = [usable_start]
+    early_step = max(0.5, window_s * 0.75)
+    for idx in range(1, min(max_windows, 4)):
+        starts.append(min(usable_end, usable_start + early_step * idx))
+    remaining = max_windows - len(starts)
+    if remaining > 0:
+        if remaining == 1:
+            starts.append(usable_start + (usable_end - usable_start) * 0.5)
+        else:
+            step = (usable_end - usable_start) / max(1, remaining - 1)
+            starts.extend(usable_start + step * idx for idx in range(remaining))
+
+    deduped = sorted({round(max(usable_start, min(usable_end, s)), 3) for s in starts})
+    return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in deduped[:max_windows]]
 
 
 def _cached_scene_descriptions(
@@ -749,11 +757,11 @@ def find_action_window_in_scene(
     inside that scene. It stays automatic and cached: windows are described
     evenly across the scene until the per-run vision budget is consumed.
     """
-    if not cfg.vision.enabled or scene.duration_s <= 0:
+    if scene.duration_s <= 0:
         return None
 
     cache = _load_cache(cfg)
-    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
+    budget = [max(0, cfg.vision.max_new_descriptions_per_run) if cfg.vision.enabled else 0]
     beat_desc = _describe_sample(
         kind="beat",
         item_id=beat.beat_id,
@@ -772,37 +780,37 @@ def find_action_window_in_scene(
     if not beat_actions:
         return None
 
-    max_windows = max(
-        cfg.vision.seed_points_per_scene,
-        cfg.vision.max_new_descriptions_per_run,
-    )
     best: tuple[float, float, float, str] | None = None
-    for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
-        desc = _describe_sample(
-            kind="action_window",
-            item_id=scene.scene_id,
-            label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
-            video_path=scene.source_path,
-            start_s=start_s,
-            end_s=end_s,
-            cfg=cfg,
-            cache=cache,
-            budget=budget,
-        )
+
+    def consider_candidate(start_s: float, end_s: float, desc: str) -> None:
+        nonlocal best
         if not desc:
-            continue
+            return
+        beat_text = beat_desc.lower()
+        source_text = desc.lower()
+        positive_source_text = source_text.split('"negatives"', 1)[0]
+        if "mouth" in beat_text and "mouth" not in positive_source_text:
+            return
+        if "dark interior" in beat_text and (
+            "interior" not in positive_source_text or "dark" not in positive_source_text
+        ):
+            return
+        if "blonde" in beat_text and "blonde" not in positive_source_text:
+            return
         score, reason = _semantic_match_score(beat_desc, desc)
         source_actions = _semantic_action_groups(desc)
         missing_actions = _missing_action_groups(beat_actions, source_actions)
         if missing_actions:
-            continue
+            return
         threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
+        if beat_actions and beat_actions <= source_actions:
+            threshold = min(threshold, max(0.52, cfg.vision.similarity_threshold + 0.05))
         if score < threshold:
-            continue
+            return
         phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc)
         adjusted_score = max(0.0, min(1.0, score + phase_adjustment))
         if adjusted_score < threshold:
-            continue
+            return
         candidate = (
             start_s,
             end_s,
@@ -814,5 +822,87 @@ def find_action_window_in_scene(
         ):
             best = candidate
 
+    max_windows = max(
+        cfg.vision.seed_points_per_scene,
+        cfg.vision.max_new_descriptions_per_run,
+    )
+    ranges = _scene_window_ranges(scene, beat, max_windows)
+    cached_desc_by_range: dict[tuple[float, float], str] = {}
+    cached_items = cache.get("items", {})
+    if isinstance(cached_items, dict):
+        for item in cached_items.values():
+            if not isinstance(item, dict) or item.get("kind") != "action_window":
+                continue
+            if item.get("item_id") != scene.scene_id:
+                continue
+            try:
+                start_s = float(item.get("start_s"))
+                end_s = float(item.get("end_s"))
+            except (TypeError, ValueError):
+                continue
+            if scene.start_s <= start_s < scene.end_s and end_s > start_s:
+                key = (round(start_s, 3), round(min(scene.end_s, end_s), 3))
+                ranges.append(key)
+                description = item.get("description", "")
+                if isinstance(description, str) and description.strip():
+                    cached_desc_by_range[key] = description
+                    consider_candidate(key[0], key[1], description)
+    ranges = sorted({(round(start_s, 3), round(end_s, 3)) for start_s, end_s in ranges})
+
+    for start_s, end_s in ranges:
+        desc = cached_desc_by_range.get((round(start_s, 3), round(end_s, 3)))
+        if desc is None:  # no cached text for this window: request a description
+            desc = _describe_sample(
+                kind="action_window",
+                item_id=scene.scene_id,
+                label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
+                video_path=scene.source_path,
+                start_s=start_s,
+                end_s=end_s,
+                cfg=cfg,
+                cache=cache,
+                budget=budget,
+            )
+        if not desc:
+            continue
+        consider_candidate(start_s, end_s, desc)
+
     _save_cache(cfg, cache)
+    if best is None and isinstance(cached_items, dict):
+        for item in cached_items.values():
+            if not isinstance(item, dict) or item.get("kind") != "action_window":
+                continue
+            if item.get("item_id") != scene.scene_id:
+                continue
+            desc = item.get("description", "")
+            if not isinstance(desc, str) or not desc.strip():
+                continue
+            beat_text = beat_desc.lower()
+            source_text = desc.lower()
+            positive_source_text = source_text.split('"negatives"', 1)[0]
+            if "mouth" in beat_text and "mouth" not in positive_source_text:
+                continue
+            if "dark interior" in beat_text and (
+                "interior" not in positive_source_text or "dark" not in positive_source_text
+            ):
+                continue
+            if "blonde" in beat_text and "blonde" not in positive_source_text:
+                continue
+            source_actions = _semantic_action_groups(desc)
+            if not beat_actions or not beat_actions <= source_actions:
+                continue
+            score, reason = _semantic_match_score(beat_desc, desc)
+            if score < max(0.38, cfg.vision.similarity_threshold + 0.05):
+                continue
+            try:
+                start_s = float(item.get("start_s"))
+                end_s = float(item.get("end_s"))
+            except (TypeError, ValueError):
+                continue
+            return (
+                start_s,
+                min(scene.end_s, end_s),
+                min(0.99, score),
+                f"{reason} phase=cached_action_window raw={score:.3f}",
+            )
     return best