Realign wrong in-scene action matches

2026-05-02 17:13:22 +02:00
parent 1a177d6b89
commit 3ea5582b49
3 changed files with 178 additions and 1 deletions
@@ -164,6 +164,11 @@ Zeitbereich nochmals gegen den Trailer-Beat prüfen. Starke Aktionsphasen wie
 Annäherung, Kuss/Stirnkontakt, Handbewegungen oder Schneiden müssen dann auch
 im Source-Fenster beschrieben sein; fehlt diese Aktionsphase, wird der Treffer
 nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
+Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
+Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
+Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
+den Inpoint darauf neu aus. Erst wenn auch diese In-Scene-Reparatur scheitert,
+wird der Treffer verworfen.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
@@ -638,10 +638,50 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
        return results

    from dataclasses import replace
-    from src.llm.vision_cache import validate_match_window_with_vision
+    from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
+    from src.cv.scene_indexer import build_scene_index
+    from src.cv.global_scan import align_in_point_by_content

    logger = logging.getLogger(__name__)
    beats_by_id = {beat.beat_id: beat for beat in beats}
+    scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
+
+    def realign_window(check_beat, scene_id: int):
+        """Try to move a rejected match onto the beat's action phase in one scene.
+
+        Looks the scene up in ``scenes_by_id``, asks
+        ``find_action_window_in_scene`` for a matching window, refines the
+        in-point with ``align_in_point_by_content``, clamps it to the scene
+        bounds and re-checks it with ``validate_match_window_with_vision``.
+
+        Returns ``(scene, aligned_in_s, score, reason)`` on success, or
+        ``None`` when the scene id is unknown, no action window is found, or
+        the re-validation rejects the aligned window.
+        """
+        target_scene = scenes_by_id.get(scene_id)
+        if target_scene is None:
+            return None
+
+        window = find_action_window_in_scene(check_beat, target_scene, cfg)
+        if window is None:
+            return None
+        window_start_s, window_end_s, semantic_score, window_reason = window
+
+        # Search window handed to the aligner: 1.5x the action window length,
+        # kept between 1 and 4 seconds.
+        search_span_s = min(4.0, (window_end_s - window_start_s) * 1.5)
+        search_span_s = max(1.0, search_span_s)
+        refined_in_s, content_score = align_in_point_by_content(
+            check_beat,
+            window_start_s,
+            cfg,
+            search_window_s=search_span_s,
+        )
+
+        # Keep the in-point inside the scene; the upper bound leaves room for
+        # the beat duration but never drops below the scene start.
+        scene_start_s = target_scene.start_s
+        latest_in_s = max(
+            target_scene.start_s,
+            target_scene.end_s - check_beat.duration_s,
+        )
+        clamped_in_s = max(scene_start_s, min(refined_in_s, latest_in_s))
+
+        accepted, verify_reason = validate_match_window_with_vision(
+            check_beat,
+            source_path=target_scene.source_path,
+            scene_id=target_scene.scene_id,
+            in_point_s=clamped_in_s,
+            out_point_s=clamped_in_s + check_beat.duration_s,
+            cfg=cfg,
+        )
+        if accepted:
+            # The semantic score is capped at 0.99 before being compared with
+            # the content score; the larger of the two is reported.
+            capped_semantic = min(0.99, semantic_score)
+            final_score = max(content_score, capped_semantic)
+            return target_scene, clamped_in_s, final_score, f"{window_reason}; {verify_reason}"
+
+        logger.info(
+            "Beat %d: action-window realign rejected scene=%d in=%.3fs (%s)",
+            check_beat.beat_id,
+            target_scene.scene_id,
+            clamped_in_s,
+            verify_reason,
+        )
+        return None
+
    kept = []
    for result in results:
        beat = beats_by_id.get(result.beat_id)
@@ -684,6 +724,68 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
        if valid:
            kept.append(result)
        else:
+            if getattr(result, "segments", ()):
+                new_segments = []
+                all_repaired = True
+                repair_reasons = []
+                for segment in result.segments:
+                    segment_beat = replace(
+                        beat,
+                        start_s=beat.start_s + segment.trailer_offset_s,
+                        end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
+                    )
+                    repair = realign_window(segment_beat, segment.scene_id)
+                    if repair is None:
+                        all_repaired = False
+                        break
+                    scene, aligned_in_s, score, repair_reason = repair
+                    repair_reasons.append(repair_reason)
+                    new_segments.append(replace(
+                        segment,
+                        scene_id=scene.scene_id,
+                        in_point_s=aligned_in_s,
+                        out_point_s=aligned_in_s + segment.duration_s,
+                        match_score=score,
+                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
+                    ))
+                if all_repaired and new_segments:
+                    first = new_segments[0]
+                    repaired_score = min(seg.match_score for seg in new_segments)
+                    logger.info(
+                        "Beat %d: realigned inside matched scene by vision action windows (%s)",
+                        result.beat_id,
+                        "; ".join(repair_reasons),
+                    )
+                    kept.append(replace(
+                        result,
+                        scene_id=first.scene_id,
+                        in_point_s=first.in_point_s,
+                        out_point_s=first.out_point_s,
+                        in_point_frame=int(first.in_point_s * cfg.export.edl_frame_rate),
+                        match_score=repaired_score,
+                        is_confirmed=repaired_score >= cfg.cv.deep_scan.match_threshold,
+                        segments=tuple(new_segments),
+                    ))
+                    continue
+            else:
+                repair = realign_window(beat, result.scene_id)
+                if repair is not None:
+                    scene, aligned_in_s, score, repair_reason = repair
+                    logger.info(
+                        "Beat %d: realigned inside matched scene by vision action window (%s)",
+                        result.beat_id,
+                        repair_reason,
+                    )
+                    kept.append(replace(
+                        result,
+                        scene_id=scene.scene_id,
+                        in_point_s=aligned_in_s,
+                        out_point_s=aligned_in_s + result.duration_s,
+                        in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
+                        match_score=score,
+                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
+                    ))
+                    continue
            logger.warning(
                "Beat %d: rejected by vision action-phase verification (%s)",
                result.beat_id,
@@ -595,3 +595,73 @@ def validate_match_window_with_vision(
    if missing_actions and score < threshold:
        return False, f"{reason} missing_actions={sorted(missing_actions)}"
    return True, reason
+
+
+def find_action_window_in_scene(
+    beat: TrailerBeat,
+    scene: Scene,
+    cfg: AppConfig,
+) -> tuple[float, float, float, str] | None:
+    """
+    Search one already selected source scene for the beat's action phase.
+
+    This is used after CV picked the right broad scene but the wrong time
+    inside that scene. Candidate windows come from ``_scene_window_ranges``
+    and each one is described through ``_describe_sample`` with the shared
+    description cache. The description budget is created fresh on every
+    call from ``cfg.vision.max_new_descriptions_per_run``.
+
+    Args:
+        beat: Trailer beat whose strong action groups must be found.
+        scene: Source scene to search; a non-positive duration short-circuits.
+        cfg: Application config; only the ``vision`` section is read here.
+
+    Returns:
+        ``(start_s, end_s, score, reason)`` of the highest-scoring window
+        that contains every strong action group of the beat and reaches the
+        score threshold. ``None`` when vision is disabled, the scene has no
+        duration, the beat yields no description, the beat has no strong
+        action group, or no window qualifies.
+    """
+    if not cfg.vision.enabled or scene.duration_s <= 0:
+        return None
+
+    cache = _load_cache(cfg)
+    # One-element list so the helper can update the remaining budget in place
+    # (presumably decremented by _describe_sample -- confirm against helper).
+    budget = [max(0, cfg.vision.max_new_descriptions_per_run)]
+    beat_desc = _describe_sample(
+        kind="beat",
+        item_id=beat.beat_id,
+        label=f"trailer beat {beat.beat_id} action search",
+        video_path=beat.trailer_path,
+        start_s=beat.start_s,
+        end_s=beat.end_s,
+        cfg=cfg,
+        cache=cache,
+        budget=budget,
+    )
+    if not beat_desc:
+        return None
+
+    beat_actions = _semantic_action_groups(beat_desc) & _STRONG_ACTION_GROUPS
+    if not beat_actions:
+        # Persist the cache before bailing out: the beat description above may
+        # have been newly added to it, and skipping the save would discard it.
+        _save_cache(cfg, cache)
+        return None
+
+    max_windows = max(
+        cfg.vision.seed_points_per_scene,
+        cfg.vision.max_new_descriptions_per_run,
+    )
+    # Loop-invariant acceptance threshold: never below 0.38, otherwise the
+    # configured similarity threshold plus 0.18.
+    threshold = max(0.38, cfg.vision.similarity_threshold + 0.18)
+    best: tuple[float, float, float, str] | None = None
+    for start_s, end_s in _scene_window_ranges(scene, beat, max_windows):
+        desc = _describe_sample(
+            kind="action_window",
+            item_id=scene.scene_id,
+            label=f"source scene {scene.scene_id} action window {start_s:.2f}-{end_s:.2f}",
+            video_path=scene.source_path,
+            start_s=start_s,
+            end_s=end_s,
+            cfg=cfg,
+            cache=cache,
+            budget=budget,
+        )
+        if not desc:
+            continue
+        score, reason = _semantic_match_score(beat_desc, desc)
+        # Every strong action of the beat must appear in the source window.
+        if beat_actions - _semantic_action_groups(desc):
+            continue
+        if score < threshold:
+            continue
+        # Strict comparison keeps the earliest window on equal scores.
+        if best is None or score > best[2]:
+            best = (start_s, end_s, score, reason)
+
+    _save_cache(cfg, cache)
+    return best