Reject vision matches with action phase mismatches

2026-05-02 16:49:47 +02:00
parent d9e470c877
commit 1a177d6b89
3 changed files with 177 additions and 3 deletions
@@ -159,6 +159,11 @@ Vision-Modell stammen. Bei langen semantisch passenden Source-Szenen beschreibt
 der Vision-Layer zusätzlich wenige lokale Zeitfenster und cached auch diese
 Fenster, damit eine grob ähnliche Szene nicht automatisch mit dem falschen
 Bewegungs- oder Dialogmoment gleichgesetzt wird.
+Nach dem CV-Match kann derselbe Vision-Layer den konkreten finalen Source-
+Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
+Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
+im Source-Fenster beschrieben sein; fehlt eine solche Aktionsphase und bleibt
+der semantische Score unter der Verifikationsschwelle, wird der Treffer nicht
+gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
@@ -632,6 +632,66 @@ def _merge_best_results(existing: list, candidates: list, cfg) -> list:
    return sorted(by_id.values(), key=lambda r: r.beat_id)


+def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) -> list:
+    """Filter out matches whose source windows fail vision action-phase validation.
+
+    ``results`` is returned unchanged when ``cfg.vision.enabled`` is falsy or
+    when ``results`` is empty. Otherwise every result is paired with the beat
+    that shares its ``beat_id``; results without such a beat are kept without
+    any check.
+
+    A result that carries segments is validated once per segment, each time
+    against a copy of the beat whose ``start_s``/``end_s`` are moved to the
+    segment's trailer offset and duration. A result without segments is
+    validated once over its own scene and in/out points. The first window
+    that ``validate_match_window_with_vision`` reports as invalid drops the
+    whole result and logs a warning listing the reasons gathered so far.
+
+    Returns a new list holding the results that were kept, in input order.
+    """
+    if not (cfg.vision.enabled and results):
+        return results
+
+    from dataclasses import replace
+    from src.llm.vision_cache import validate_match_window_with_vision
+
+    log = logging.getLogger(__name__)
+    beat_lookup = {b.beat_id: b for b in beats}
+    accepted = []
+    for match in results:
+        beat = beat_lookup.get(match.beat_id)
+        if beat is None:
+            # Nothing to compare against, so the match passes through.
+            accepted.append(match)
+            continue
+
+        if getattr(match, "segments", ()):
+            # One check per sub-shot, each against its own slice of the beat.
+            checks = [
+                (
+                    replace(
+                        beat,
+                        start_s=beat.start_s + seg.trailer_offset_s,
+                        end_s=beat.start_s + seg.trailer_offset_s + seg.duration_s,
+                    ),
+                    seg.scene_id,
+                    seg.in_point_s,
+                    seg.out_point_s,
+                )
+                for seg in match.segments
+            ]
+        else:
+            checks = [(beat, match.scene_id, match.in_point_s, match.out_point_s)]
+
+        notes: list[str] = []
+        for window_beat, scene_id, window_start, window_end in checks:
+            passed, note = validate_match_window_with_vision(
+                window_beat,
+                source_path=match.source_path,
+                scene_id=scene_id,
+                in_point_s=window_start,
+                out_point_s=window_end,
+                cfg=cfg,
+            )
+            notes.append(note)
+            if not passed:
+                # Stop at the first failing window; remaining ones are not checked.
+                log.warning(
+                    "Beat %d: rejected by vision action-phase verification (%s)",
+                    match.beat_id,
+                    "; ".join(notes),
+                )
+                break
+        else:
+            # Loop finished without a failure: every window was accepted.
+            accepted.append(match)
+    return accepted
+
+
 def _attach_visual_segments(results: list, beats: list, cfg) -> list:
    """Attach automatic sub-shot matches for multi-island trailer beats."""
    from dataclasses import replace
@@ -976,6 +1036,7 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
        skip_global_segment_scan_for=set(single_island_trims),
    )
    results = _attach_visual_segments(results, beats, cfg)
+    results = _filter_semantically_invalid_vision_matches(results, beats, cfg)

    # A targeted one-beat match should improve the cache without deleting
    # automatic matches for other beats.
@@ -50,6 +50,18 @@ _CREDIT_ERROR_PATTERNS = (
    "payment required",
 )

+# Action vocabulary for vision descriptions: maps an action-group name to the
+# set of words that signal it. _semantic_action_groups() reports a group when
+# the terms extracted from a description intersect that group's word set.
+_ACTION_GROUPS = {
+    "kiss": {"kiss", "kisses", "kissing", "kissed"},
+    "forehead_touch": {"forehead", "foreheads", "touch", "touches", "touching", "touched"},
+    "approach": {"approach", "approaches", "approaching", "closer", "lean", "leans", "leaning"},
+    "talk": {"talk", "talking", "speak", "speaking", "conversation", "conversing"},
+    "hand": {"hand", "hands", "holding", "holds", "raise", "raises", "raising", "lift", "lifting"},
+    "cutting": {"cut", "cuts", "cutting", "knife", "blade", "scissors"},
+    "look_down": {"down", "lowering", "lowers"},
+    "turn": {"turn", "turns", "turning"},
+}
+# Subset of group names that _semantic_match_score() treats as required: if
+# the beat description shows one of these and the candidate does not, the
+# candidate's score is penalized. Groups outside this set ("talk",
+# "look_down", "turn") are reported but never required.
+_STRONG_ACTION_GROUPS = {"kiss", "forehead_touch", "approach", "hand", "cutting"}
+

 def _cache_path(cfg: AppConfig) -> Path:
    return cfg.paths.cache_dir / "vision_descriptions.json"
@@ -251,6 +263,40 @@ def _text_similarity(a: str, b: str) -> float:
    return float(overlap / max(8, min(len(ta), len(tb))))


+def _semantic_action_groups(text: str) -> set[str]:
+    """Return the names of the action groups that ``text`` mentions.
+
+    A group from ``_ACTION_GROUPS`` counts as mentioned when the terms that
+    ``_terms`` extracts from ``text`` share at least one word with the
+    group's word set. In addition, a few multi-word phrases are looked up in
+    the lower-cased text and mark the ``"approach"`` group.
+    """
+    words = _terms(text)
+    haystack = text.lower()
+
+    found: set[str] = set()
+    for group, vocabulary in _ACTION_GROUPS.items():
+        if words & vocabulary:
+            found.add(group)
+
+    # Phrases matched by substring search on the raw lower-cased text rather
+    # than on the extracted terms.
+    approach_phrases = ("moving closer", "move closer", "face-to-face", "faces facing")
+    if any(phrase in haystack for phrase in approach_phrases):
+        found.add("approach")
+    return found
+
+
+def _semantic_match_score(beat_desc: str, candidate_desc: str) -> tuple[float, str]:
+    """Score how well a candidate description matches a beat description.
+
+    Starts from ``_text_similarity`` and adjusts it by action groups. The
+    strong action groups found in the beat description are required:
+
+    * if any required group is absent from the candidate, 0.18 per absent
+      group (capped at 0.45) is subtracted, and the score is floored at 0.0;
+    * if there are required groups and none is absent, 0.12 is added, and
+      the score is capped at 1.0;
+    * if the beat has no strong action group, the score is left as is.
+
+    Returns the adjusted score and a text summary containing the score and
+    the sorted action groups of both descriptions.
+    """
+    score = _text_similarity(beat_desc, candidate_desc)
+    wanted = _semantic_action_groups(beat_desc)
+    offered = _semantic_action_groups(candidate_desc)
+
+    must_have = wanted & _STRONG_ACTION_GROUPS
+    absent = must_have - offered
+    if absent:
+        score = max(0.0, score - min(0.45, 0.18 * len(absent)))
+    elif must_have:
+        score = min(1.0, score + 0.12)
+
+    summary = (
+        f"semantic={score:.3f} "
+        f"beat_actions={sorted(wanted)} "
+        f"candidate_actions={sorted(offered)}"
+    )
+    return score, summary
+
+
 def _scene_seed_points(scene: Scene, max_points: int) -> list[float]:
    if max_points <= 1 or scene.duration_s <= 0:
        return [scene.start_s]
@@ -340,7 +386,7 @@ def _add_window_seed_descriptions(
            )
            if not desc:
                continue
-            score = _text_similarity(beat_desc, desc)
+            score, _reason = _semantic_match_score(beat_desc, desc)
            if score < cfg.vision.similarity_threshold:
                continue
            semantic_score = min(0.99, score + 0.30)
@@ -427,7 +473,7 @@ def build_vision_seed_in_points(
            )
            if not scene_desc:
                continue
-            score = _text_similarity(beat_desc, scene_desc)
+            score, _reason = _semantic_match_score(beat_desc, scene_desc)
            if score >= cfg.vision.similarity_threshold:
                ranked_by_scene[scene.scene_id] = (min(0.99, score + 0.25), scene, "vision")

@@ -440,7 +486,7 @@ def build_vision_seed_in_points(
                ranked_by_scene[scene.scene_id] = (vibe_score, scene, "vibe")

        for scene, scene_desc in _cached_scene_descriptions(cache, scenes_by_id, cfg):
-            score = _text_similarity(beat_desc, scene_desc)
+            score, _reason = _semantic_match_score(beat_desc, scene_desc)
            if score < cfg.vision.similarity_threshold:
                continue
            semantic_score = min(0.99, score + 0.25)
@@ -487,3 +533,65 @@ def build_vision_seed_in_points(

    _save_cache(cfg, cache)
    return seeds
+
+
+def validate_match_window_with_vision(
+    beat: TrailerBeat,
+    *,
+    source_path: Path,
+    scene_id: int,
+    in_point_s: float,
+    out_point_s: float,
+    cfg: AppConfig,
+) -> tuple[bool, str]:
+    """
+    Check a final source window against the trailer beat via vision text.
+
+    Low-level CV can confirm a shot that merely looks similar; this check
+    additionally requires the strong action groups of the beat description
+    to show up in the description of the matched source window.
+
+    Returns ``(ok, reason)``:
+
+    * ``(True, "vision disabled")`` when ``cfg.vision.enabled`` is falsy;
+    * ``(False, "empty source window")`` when ``out_point_s <= in_point_s``;
+    * ``(True, ...)`` when either description comes back empty, so a
+      missing description never rejects a match;
+    * ``(False, ...)`` only when a strong beat action is missing from the
+      source description and the semantic score is below
+      ``max(0.32, cfg.vision.similarity_threshold + 0.12)``;
+    * ``(True, reason)`` otherwise.
+
+    The cache is written back with ``_save_cache`` once the source window
+    has been described; the early returns before that point do not save.
+    """
+    if not cfg.vision.enabled:
+        return True, "vision disabled"
+    if not in_point_s < out_point_s:
+        return False, "empty source window"
+
+    cache = _load_cache(cfg)
+    # One-element list handed to both describe calls; presumably a mutable
+    # allowance of new descriptions for this check (confirm against
+    # _describe_sample). The configured value is clamped to the range 0..2.
+    allowance = [min(2, max(0, cfg.vision.max_new_descriptions_per_run))]
+    shared = {"cfg": cfg, "cache": cache, "budget": allowance}
+
+    beat_text = _describe_sample(
+        kind="beat",
+        item_id=beat.beat_id,
+        label=f"trailer beat {beat.beat_id} verification",
+        video_path=beat.trailer_path,
+        start_s=beat.start_s,
+        end_s=beat.end_s,
+        **shared,
+    )
+    if not beat_text:
+        return True, "no beat vision description"
+
+    source_text = _describe_sample(
+        kind="match_window",
+        item_id=scene_id,
+        label=f"matched source scene {scene_id} window {in_point_s:.2f}-{out_point_s:.2f}",
+        video_path=source_path,
+        start_s=in_point_s,
+        end_s=out_point_s,
+        **shared,
+    )
+    _save_cache(cfg, cache)
+    if not source_text:
+        return True, "no source vision description"
+
+    score, reason = _semantic_match_score(beat_text, source_text)
+    # Strong actions the beat shows but the source window does not.
+    absent = (
+        _semantic_action_groups(beat_text) & _STRONG_ACTION_GROUPS
+    ) - _semantic_action_groups(source_text)
+    floor = max(0.32, cfg.vision.similarity_threshold + 0.12)
+
+    # A missing action alone is not enough; the score must also be weak.
+    if not (absent and score < floor):
+        return True, reason
+    return False, f"{reason} missing_actions={sorted(absent)}"