From e966a4c3217cdec4c0ea4693b1b09d2b07a8f7aa Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 9 May 2026 18:30:13 +0200 Subject: [PATCH] Filter cached vision action windows --- cli.py | 82 +++++++++++++++++++++- docs/ALGORITHM.md | 26 +++++++ src/llm/vision_cache.py | 146 ++++++++++++++++++++++++++++++++-------- 3 files changed, 224 insertions(+), 30 deletions(-) diff --git a/cli.py b/cli.py index 3c6e071..b6d5a3b 100644 --- a/cli.py +++ b/cli.py @@ -908,7 +908,7 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list Confirmed and provisional matches both stay subject to the same thresholds used elsewhere; this only adds matches that pass the same quality gates. """ - if not cfg.vision.enabled or not beats: + if not beats: return results from dataclasses import replace @@ -977,6 +977,79 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list scenes_by_id = {s.scene_id: s for s in scenes} best = None # (score, scene, in_s, dur_s, reason) + try: + from src.llm.vision_cache import ( + _load_cache, + _semantic_action_groups, + _semantic_match_score, + _STRONG_ACTION_GROUPS, + ) + cache = _load_cache(cfg) + items = cache.get("items", {}) + beat_desc = "" + if isinstance(items, dict): + for item in items.values(): + if ( + isinstance(item, dict) + and item.get("kind") == "beat" + and item.get("item_id") == beat.beat_id + ): + beat_desc = str(item.get("description", "")) + break + beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS if beat_desc else set() + identity_vocab = { + "woman", "women", "man", "men", "girl", "boy", "child", + "blonde", "hair", "face", "mouth", "eyes", "profile", + "close-up", "closeup", + } + beat_identity = {term for term in identity_vocab if term in beat_desc.lower()} + distinctive_identity = { + term for term in ("woman", "women", "blonde", "mouth", "face") + if term in beat_desc.lower() + } + if beat_actions and isinstance(items, dict): + for item in items.values(): + if not isinstance(item, dict) or item.get("kind") != "action_window": + continue + scene = scenes_by_id.get(item.get("item_id")) + desc = str(item.get("description", "")) + source_actions = _semantic_action_groups(desc) + if scene is None or not beat_actions <= source_actions: + continue + source_text = desc.lower() + positive_source_text = source_text.split('"negatives"', 1)[0] + identity_overlap = {term for term in beat_identity if term in source_text} + if len(beat_identity) >= 2 and len(identity_overlap) < 2: + continue + if distinctive_identity and not any(term in positive_source_text for term in distinctive_identity): + continue + if "mouth" in beat_desc.lower() and "mouth" not in positive_source_text: + continue + if "dark interior" in beat_desc.lower() and ( + "interior" not in positive_source_text or "dark" not in positive_source_text + ): + continue + score, reason = _semantic_match_score(beat_desc, desc) + if score < max(0.60, cfg.cv.deep_scan.provisional_match_threshold): + continue + try: + in_s = float(item.get("start_s")) + out_s = float(item.get("end_s")) + except (TypeError, ValueError): + continue + duration_s = max(0.32, min(anchor_beat.duration_s, out_s - in_s)) + candidate = ( + min(0.99, score), + scene, + in_s, + duration_s, + f"cached vision action; {reason}", + ) + if best is None or candidate[0] > best[0]: + best = candidate + except Exception as exc: + logger.debug("Beat %d: cached vision fallback failed (%s)", beat.beat_id, exc) + seen = set() for hit in hits[: cfg.cv.deep_scan.scene_seed_top_k]: scene = scenes_by_id.get(hit.scene_id) @@ -1003,7 +1076,10 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list ) except Exception as exc: logger.debug("Beat %d: align failed for scene %d (%s)", beat.beat_id, scene.scene_id, exc) - continue + aligned_in_s = start_s + combined_score = semantic_score + content_score = 0.0 + motion_score = 0.0 aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - anchor_beat.duration_s))) try: @@ -1033,6 +1109,8 @@ def _recover_unmatched_beats_via_vision(results: list, beats: list, cfg) -> list combined_score, min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08), ) + if semantic_score >= max(0.60, cfg.cv.deep_scan.provisional_match_threshold): + final_score = max(final_score, semantic_score) if final_score < cfg.cv.deep_scan.provisional_match_threshold: continue candidate = (final_score, scene, aligned_in_s, usable_duration_s, f"recovery; {reason}; {verify_reason}") diff --git a/docs/ALGORITHM.md b/docs/ALGORITHM.md index 5f6c0e6..69555e1 100644 --- a/docs/ALGORITHM.md +++ b/docs/ALGORITHM.md @@ -194,6 +194,32 @@ Die Vision-Recovery läuft nicht nur für komplett fehlende Beats, sondern auch für schwache unbestätigte Treffer. Gerade Low-Light-Beats dürfen nicht an einem falschen dunklen CV-Treffer hängen bleiben, wenn der Cache semantisch eine bessere Handlungsphase kennt. +Bei langen Source-Szenen prüft die Action-Window-Suche immer den Szenenanfang +und mehrere frühe Fenster, bevor sie gleichmäßig über die ganze Szene sampelt. +Damit gehen kurze Trailer-Aktionen am Anfang einer langen Szene nicht unter, +wenn der Rest der Szene aus Credits, Schwarzbild oder ruhigen Folgeframes +besteht. +Wenn ein Action-Window die starke Beat-Aktion explizit enthält, darf es eine +etwas niedrigere Textähnlichkeit haben; die Handlung zählt dann stärker als +Nebenwörter zu Licht, Bildausschnitt oder Stimmung. +Bereits gecachte Action-Windows einer Szene bleiben gültige Kandidaten, auch +wenn sich das aktuelle Sampling-Raster ändert. So verliert der Matcher keine +teuren Vision-Hinweise und muss dieselben Fenster nicht erneut beschreiben. +Wenn neue Vision-Calls deaktiviert sind, darf die Recovery vorhandene Cache- +Beschreibungen trotzdem lesen; das erzeugt keine API-Kosten und verhindert, +dass alte schwache CV-Treffer stehen bleiben. +Schlägt die CV-Feinjustierung bei einem semantisch klaren Low-Light-Fenster +fehl, bleibt das Action-Window als provisorischer Treffer erhalten. CV darf +einen dunklen Treffer verfeinern, aber nicht einen eindeutigen Cache-Hinweis +komplett verwerfen. +Zusätzlich kann Recovery vorhandene gecachte Action-Windows direkt über alle +Szenen ranken. Dieser schnelle Pfad vermeidet einen teuren Vollscan, wenn der +Cache bereits eine starke Aktion wie Hand-am-Mund, Kuss oder Blickwechsel +enthält. +Eindeutige Begriffe aus der Beat-Beschreibung wirken als harte Filter für +Vision-Fenster: `mouth` muss im Kandidaten wiederkehren, `dark interior` darf +nicht auf Outdoor-Material fallen, und markante Personenmerkmale wie `blonde` +bleiben bindend. Der zusätzliche Hi-Res-Phasenrefine bleibt lokal um den bereits validierten Inpoint und übernimmt nur klare Verbesserungen. Er darf keine ganze lange diff --git a/src/llm/vision_cache.py b/src/llm/vision_cache.py index 6752c65..6355f3e 100644 --- a/src/llm/vision_cache.py +++ b/src/llm/vision_cache.py @@ -434,12 +434,20 @@ def _scene_window_ranges(scene: Scene, beat: TrailerBeat, max_windows: int) -> l usable_start = scene.start_s usable_end = max(scene.start_s, scene.end_s - window_s) - if max_windows == 1: - starts = [usable_start + (usable_end - usable_start) * 0.5] - else: - step = (usable_end - usable_start) / max(1, max_windows - 1) - starts = [usable_start + step * idx for idx in range(max_windows)] - return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in starts] + starts = [usable_start] + early_step = max(0.5, window_s * 0.75) + for idx in range(1, min(max_windows, 4)): + starts.append(min(usable_end, usable_start + early_step * idx)) + remaining = max_windows - len(starts) + if remaining > 0: + if remaining == 1: + starts.append(usable_start + (usable_end - usable_start) * 0.5) + else: + step = (usable_end - usable_start) / max(1, remaining - 1) + starts.extend(usable_start + step * idx for idx in range(remaining)) + + deduped = sorted({round(max(usable_start, min(usable_end, s)), 3) for s in starts}) + return [(start_s, min(scene.end_s, start_s + window_s)) for start_s in deduped[:max_windows]] def _cached_scene_descriptions( @@ -749,11 +757,11 @@ def find_action_window_in_scene( inside that scene. It stays automatic and cached: windows are described evenly across the scene until the per-run vision budget is consumed. """ - if not cfg.vision.enabled or scene.duration_s <= 0: + if scene.duration_s <= 0: return None cache = _load_cache(cfg) - budget = [max(0, cfg.vision.max_new_descriptions_per_run)] + budget = [max(0, cfg.vision.max_new_descriptions_per_run) if cfg.vision.enabled else 0] beat_desc = _describe_sample( kind="beat", item_id=beat.beat_id, @@ -772,37 +780,37 @@ def find_action_window_in_scene( if not beat_actions: return None - max_windows = max( - cfg.vision.seed_points_per_scene, - cfg.vision.max_new_descriptions_per_run, - ) best: tuple[float, float, float, str] | None = None - for start_s, end_s in _scene_window_ranges(scene, beat, max_windows): - desc = _describe_sample( - kind="action_window", - item_id=scene.scene_id, - label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}", - video_path=scene.source_path, - start_s=start_s, - end_s=end_s, - cfg=cfg, - cache=cache, - budget=budget, - ) + + def consider_candidate(start_s: float, end_s: float, desc: str) -> None: + nonlocal best if not desc: - continue + return + beat_text = beat_desc.lower() + source_text = desc.lower() + positive_source_text = source_text.split('"negatives"', 1)[0] + if "mouth" in beat_text and "mouth" not in positive_source_text: + return + if "dark interior" in beat_text and ( + "interior" not in positive_source_text or "dark" not in positive_source_text + ): + return + if "blonde" in beat_text and "blonde" not in positive_source_text: + return score, reason = _semantic_match_score(beat_desc, desc) source_actions = _semantic_action_groups(desc) missing_actions = _missing_action_groups(beat_actions, source_actions) if missing_actions: - continue + return threshold = max(0.38, cfg.vision.similarity_threshold + 0.18) + if beat_actions and beat_actions <= source_actions: + threshold = min(threshold, max(0.52, cfg.vision.similarity_threshold + 0.05)) if score < threshold: - continue + return phase_adjustment, phase_reason = _action_phase_adjustment(beat_desc, desc) adjusted_score = max(0.0, min(1.0, score + phase_adjustment)) if adjusted_score < threshold: - continue + return candidate = ( start_s, end_s, @@ -814,5 +822,87 @@ def find_action_window_in_scene( ): best = candidate + max_windows = max( + cfg.vision.seed_points_per_scene, + cfg.vision.max_new_descriptions_per_run, + ) + ranges = _scene_window_ranges(scene, beat, max_windows) + cached_desc_by_range: dict[tuple[float, float], str] = {} + cached_items = cache.get("items", {}) + if isinstance(cached_items, dict): + for item in cached_items.values(): + if not isinstance(item, dict) or item.get("kind") != "action_window": + continue + if item.get("item_id") != scene.scene_id: + continue + try: + start_s = float(item.get("start_s")) + end_s = float(item.get("end_s")) + except (TypeError, ValueError): + continue + if scene.start_s <= start_s < scene.end_s and end_s > start_s: + key = (round(start_s, 3), round(min(scene.end_s, end_s), 3)) + ranges.append(key) + description = item.get("description", "") + if isinstance(description, str) and description.strip(): + cached_desc_by_range[key] = description + consider_candidate(key[0], key[1], description) + ranges = sorted({(round(start_s, 3), round(end_s, 3)) for start_s, end_s in ranges}) + + for start_s, end_s in ranges: + desc = cached_desc_by_range.get((round(start_s, 3), round(end_s, 3))) + if desc is None: + desc = _describe_sample( + kind="action_window", + item_id=scene.scene_id, + label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}", + video_path=scene.source_path, + start_s=start_s, + end_s=end_s, + cfg=cfg, + cache=cache, + budget=budget, + ) + if not desc: + continue + consider_candidate(start_s, end_s, desc) + _save_cache(cfg, cache) + if best is None and isinstance(cached_items, dict): + for item in cached_items.values(): + if not isinstance(item, dict) or item.get("kind") != "action_window": + continue + if item.get("item_id") != scene.scene_id: + continue + desc = item.get("description", "") + if not isinstance(desc, str) or not desc.strip(): + continue + beat_text = beat_desc.lower() + source_text = desc.lower() + positive_source_text = source_text.split('"negatives"', 1)[0] + if "mouth" in beat_text and "mouth" not in positive_source_text: + continue + if "dark interior" in beat_text and ( + "interior" not in positive_source_text or "dark" not in positive_source_text + ): + continue + if "blonde" in beat_text and "blonde" not in positive_source_text: + continue + source_actions = _semantic_action_groups(desc) + if not beat_actions or not beat_actions <= source_actions: + continue + score, reason = _semantic_match_score(beat_desc, desc) + if score < max(0.38, cfg.vision.similarity_threshold + 0.05): + continue + try: + start_s = float(item.get("start_s")) + end_s = float(item.get("end_s")) + except (TypeError, ValueError): + continue + return ( + start_s, + min(scene.end_s, end_s), + min(0.99, score), + f"{reason} phase=cached_action_window raw={score:.3f}", + ) return best