From 1a177d6b891e80f2bcf95d51503afe6f624ddbfa Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 2 May 2026 16:49:47 +0200 Subject: [PATCH] Reject vision matches with action phase mismatches --- README.md | 5 ++ cli.py | 61 +++++++++++++++++++++ src/llm/vision_cache.py | 114 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 177 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6e46c5a..e53c029 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,11 @@ Vision-Modell stammen. Bei langen semantisch passenden Source-Szenen beschreibt der Vision-Layer zusätzlich wenige lokale Zeitfenster und cached auch diese Fenster, damit eine grob ähnliche Szene nicht automatisch mit dem falschen Bewegungs- oder Dialogmoment gleichgesetzt wird. +Nach dem CV-Match kann derselbe Vision-Layer den konkreten finalen Source- +Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie +Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch +im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer +nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist. Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen FFmpeg-Vollscan. 
def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) -> list:
    """Return the matches that survive vision action-phase verification.

    When vision is disabled or ``results`` is empty, ``results`` is returned
    unchanged (the very same object). Otherwise every match is checked with
    ``validate_match_window_with_vision``:

    * a match whose ``beat_id`` has no entry in ``beats`` is kept unchecked;
    * a match with a non-empty ``segments`` attribute is checked once per
      segment, each against a copy of the beat narrowed to that segment's
      trailer time range;
    * any other match is checked once, using its own scene and in/out points.

    Checking stops at the first failing window. A rejected match is logged as
    a warning together with the reasons collected up to that point, and is
    left out of the returned list.
    """
    if not (cfg.vision.enabled and results):
        return results

    from dataclasses import replace
    from src.llm.vision_cache import validate_match_window_with_vision

    log = logging.getLogger(__name__)
    beat_lookup = {candidate.beat_id: candidate for candidate in beats}

    def _windows_for(match, beat):
        # One (beat, scene_id, in_point, out_point) tuple per window to verify.
        segments = getattr(match, "segments", ())
        if not segments:
            return [(beat, match.scene_id, match.in_point_s, match.out_point_s)]
        collected = []
        for seg in segments:
            # Segment offsets are relative to the start of the trailer beat.
            seg_start = beat.start_s + seg.trailer_offset_s
            narrowed = replace(
                beat,
                start_s=seg_start,
                end_s=seg_start + seg.duration_s,
            )
            collected.append((narrowed, seg.scene_id, seg.in_point_s, seg.out_point_s))
        return collected

    accepted = []
    for match in results:
        beat = beat_lookup.get(match.beat_id)
        if beat is None:
            # Nothing to compare against, so the match is kept as-is.
            accepted.append(match)
            continue

        notes: list[str] = []
        for window_beat, scene_id, start_s, stop_s in _windows_for(match, beat):
            passed, note = validate_match_window_with_vision(
                window_beat,
                source_path=match.source_path,
                scene_id=scene_id,
                in_point_s=start_s,
                out_point_s=stop_s,
                cfg=cfg,
            )
            notes.append(note)
            if not passed:
                log.warning(
                    "Beat %d: rejected by vision action-phase verification (%s)",
                    match.beat_id,
                    "; ".join(notes),
                )
                break
        else:
            # Every window passed (the loop finished without a break).
            accepted.append(match)
    return accepted
# Coarse action phases and the description words that signal each of them.
_ACTION_GROUPS = {
    "kiss": {"kiss", "kisses", "kissing", "kissed"},
    "forehead_touch": {"forehead", "foreheads", "touch", "touches", "touching", "touched"},
    "approach": {"approach", "approaches", "approaching", "closer", "lean", "leans", "leaning"},
    "talk": {"talk", "talking", "speak", "speaking", "conversation", "conversing"},
    "hand": {"hand", "hands", "holding", "holds", "raise", "raises", "raising", "lift", "lifting"},
    "cutting": {"cut", "cuts", "cutting", "knife", "blade", "scissors"},
    "look_down": {"down", "lowering", "lowers"},
    "turn": {"turn", "turns", "turning"},
}
# Phases that a candidate description must also mention when the beat does.
_STRONG_ACTION_GROUPS = {"kiss", "forehead_touch", "approach", "hand", "cutting"}


def _semantic_action_groups(text: str) -> set[str]:
    """Return the names of the action groups that ``text`` mentions.

    A group is reported when the terms extracted by ``_terms`` share at least
    one word with that group's vocabulary in ``_ACTION_GROUPS``. A few phrases
    are additionally matched as plain substrings of the lower-cased text and
    count as ``"approach"``.
    """
    tokens = _terms(text)
    haystack = text.lower()

    found = set()
    for group, vocabulary in _ACTION_GROUPS.items():
        if tokens & vocabulary:
            found.add(group)

    approach_phrases = ("moving closer", "move closer", "face-to-face", "faces facing")
    if any(phrase in haystack for phrase in approach_phrases):
        found.add("approach")
    return found


def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
    """Score a candidate description against a beat description.

    Starts from ``_text_similarity`` and adjusts it by the strong action
    groups the beat mentions:

    * each strong group missing from the candidate costs 0.18, capped at 0.45
      in total, and the score never drops below 0.0;
    * if the beat has strong groups and the candidate covers all of them, the
      score is raised by 0.12, capped at 1.0.

    Returns the adjusted score and a one-line explanation listing the score
    and the action groups found on both sides.
    """
    score = _text_similarity(beat_desc, candidate_desc)
    wanted = _semantic_action_groups(beat_desc)
    offered = _semantic_action_groups(candidate_desc)

    must_have = wanted & _STRONG_ACTION_GROUPS
    absent = must_have - offered
    if absent:
        score = max(0.0, score - min(0.45, 0.18 * len(absent)))
    elif must_have:
        score = min(1.0, score + 0.12)

    explanation = (
        f"semantic={score:.3f} "
        f"beat_actions={sorted(wanted)} "
        f"candidate_actions={sorted(offered)}"
    )
    return score, explanation


def validate_match_window_with_vision(
    beat: TrailerBeat,
    *,
    source_path: Path,
    scene_id: int,
    in_point_s: float,
    out_point_s: float,
    cfg: AppConfig,
) -> tuple[bool, str]:
    """Check a final source window against its trailer beat using vision text.

    Intended as a conservative safety net: a CV match may look right while the
    described action phase differs from the beat. The function describes the
    beat and the source window via ``_describe_sample``, saves the cache, and
    compares both descriptions with ``_semantic_match_score``.

    Returns ``(ok, reason)``:

    * ``(True, "vision disabled")`` when vision is switched off;
    * ``(False, "empty source window")`` when ``out_point_s <= in_point_s``;
    * ``(True, ...)`` when either description is unavailable, so the check
      never rejects a match it could not actually inspect;
    * ``(False, ...)`` only when a strong action group of the beat is missing
      from the source description *and* the score is below
      ``max(0.32, similarity_threshold + 0.12)``;
    * ``(True, reason)`` otherwise.

    NOTE(review): the original author's note says descriptions are cached per
    exact window; that depends on ``_describe_sample``, which is not visible
    here -- confirm.
    """
    if not cfg.vision.enabled:
        return True, "vision disabled"
    if in_point_s >= out_point_s:
        return False, "empty source window"

    cache = _load_cache(cfg)
    # At most two descriptions (beat + source window) are requested per call.
    allowance = min(2, max(0, cfg.vision.max_new_descriptions_per_run))
    shared = {"cfg": cfg, "cache": cache, "budget": [allowance]}

    beat_desc = _describe_sample(
        kind="beat",
        item_id=beat.beat_id,
        label=f"trailer beat {beat.beat_id} verification",
        video_path=beat.trailer_path,
        start_s=beat.start_s,
        end_s=beat.end_s,
        **shared,
    )
    if not beat_desc:
        return True, "no beat vision description"

    source_desc = _describe_sample(
        kind="match_window",
        item_id=scene_id,
        label=f"matched source scene {scene_id} window {in_point_s:.2f}-{out_point_s:.2f}",
        video_path=source_path,
        start_s=in_point_s,
        end_s=out_point_s,
        **shared,
    )
    # Save before inspecting the result, as the original does.
    _save_cache(cfg, cache)
    if not source_desc:
        return True, "no source vision description"

    score, reason = _semantic_match_score(beat_desc, source_desc)
    required = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
    missing_actions = required - _semantic_action_groups(source_desc)
    floor = max(0.32, cfg.vision.similarity_threshold + 0.12)
    if not (missing_actions and score < floor):
        return True, reason
    return False, f"{reason} missing_actions={sorted(missing_actions)}"