Trim retimed segments when phase drifts

2026-05-02 23:04:41 +02:00
parent e293835a86
commit 2cc05e4737
2 changed files with 42 additions and 15 deletions
@@ -191,6 +191,11 @@ sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den
 Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung
 eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende
 beginnt.
+Der Segment-Offset zählt dabei nur über vorherige scorebare Bildinseln, nicht
+über Schwarz- oder Blendenlücken. Nach dem Retiming wird die nutzbare
+Source-Dauer erneut geschätzt; läuft die Source am Ende in eine sichtbar andere
+Aktionsphase, wird der Clip gekürzt und der Rest bleibt Placeholder/Fade, statt
+einen falschen Bewegungsmoment zu zeigen.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
@@ -640,12 +640,24 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
    from dataclasses import replace
    from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
    from src.cv.scene_indexer import build_scene_index
-    from src.cv.global_scan import align_in_point_by_content_and_motion
+    from src.cv.global_scan import align_in_point_by_content_and_motion, estimate_usable_source_duration

    logger = logging.getLogger(__name__)
    beats_by_id = {beat.beat_id: beat for beat in beats}
    scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}

+    def visible_content_offset(action_beat, segment_start_offset_s: float) -> float:  # total scoreable time lying before the given offset
+        content_offset_s = 0.0  # running sum of scoreable seconds counted so far
+        for start_s, end_s in _reference_scoreable_segments(action_beat, cfg):  # NOTE(review): the early breaks below assume ascending segments on the same time base as the offset — confirm
+            if end_s <= segment_start_offset_s:  # segment ends at or before the offset: count its full length
+                content_offset_s += max(0.0, end_s - start_s)  # max() guards against inverted (end < start) pairs
+            elif start_s < segment_start_offset_s:  # segment straddles the offset: count only the part before it
+                content_offset_s += max(0.0, segment_start_offset_s - start_s)
+                break  # nothing after the straddling segment is counted
+            else:  # segment starts at or after the offset: stop without adding anything
+                break
+        return content_offset_s
+
    def realign_window(check_beat, scene_id: int, action_beat=None):
        scene = scenes_by_id.get(scene_id)
        if scene is None:
@@ -655,9 +667,10 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
            return None
        start_s, end_s, semantic_score, reason = found
        if action_beat is not None:
-            offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s)
-            start_s += offset_delta_s
-            end_s += offset_delta_s
+            segment_start_offset_s = max(0.0, check_beat.start_s - action_beat.start_s)
+            content_offset_s = visible_content_offset(action_beat, segment_start_offset_s)
+            start_s += content_offset_s
+            end_s += content_offset_s
        window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
        aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
            check_beat,
@@ -666,12 +679,16 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
            search_window_s=window_s,
        )
        aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
+        usable_duration_s, usable_score = estimate_usable_source_duration(check_beat, aligned_in_s, cfg)
+        usable_duration_s = max(0.0, min(check_beat.duration_s, usable_duration_s))
+        if usable_duration_s < max(0.32, check_beat.duration_s * 0.45):
+            usable_duration_s = check_beat.duration_s
        ok, verify_reason = validate_match_window_with_vision(
            check_beat,
            source_path=scene.source_path,
            scene_id=scene.scene_id,
            in_point_s=aligned_in_s,
-            out_point_s=aligned_in_s + check_beat.duration_s,
+            out_point_s=aligned_in_s + usable_duration_s,
            cfg=cfg,
        )
        if not ok:
@@ -683,8 +700,11 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                verify_reason,
            )
            return None
-        score = max(combined_score, min(0.99, semantic_score * 0.70 + motion_score * 0.20 + content_score * 0.10))
-        return scene, aligned_in_s, score, f"{reason}; {verify_reason}"
+        score = max(
+            combined_score,
+            min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
+        )
+        return scene, aligned_in_s, usable_duration_s, score, f"{reason}; {verify_reason}"

    kept = []
    for result in results:
@@ -745,7 +765,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                    if repair is None:
                        new_segments.append(segment)
                        continue
-                    repair_scene, aligned_in_s, score, repair_reason = repair
+                    repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                    if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate:
                        new_segments.append(segment)
                        continue
@@ -755,7 +775,8 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                        segment,
                        scene_id=repair_scene.scene_id,
                        in_point_s=aligned_in_s,
-                        out_point_s=aligned_in_s + segment.duration_s,
+                        out_point_s=aligned_in_s + usable_duration_s,
+                        duration_s=usable_duration_s,
                        match_score=score,
                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
                    ))
@@ -783,7 +804,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0):
                    repair = realign_window(beat, result.scene_id)
                    if repair is not None:
-                        repair_scene, aligned_in_s, score, repair_reason = repair
+                        repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                        if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate:
                            logger.info(
                                "Beat %d: realigned semantically valid long scene by motion/action window (%s)",
@@ -794,7 +815,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                                result,
                                scene_id=repair_scene.scene_id,
                                in_point_s=aligned_in_s,
-                                out_point_s=aligned_in_s + result.duration_s,
+                                out_point_s=aligned_in_s + usable_duration_s,
                                in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
                                match_score=score,
                                is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
@@ -817,13 +838,14 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                    if repair is None:
                        all_repaired = False
                        break
-                    scene, aligned_in_s, score, repair_reason = repair
+                    scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                    repair_reasons.append(repair_reason)
                    new_segments.append(replace(
                        segment,
                        scene_id=scene.scene_id,
                        in_point_s=aligned_in_s,
-                        out_point_s=aligned_in_s + segment.duration_s,
+                        out_point_s=aligned_in_s + usable_duration_s,
+                        duration_s=usable_duration_s,
                        match_score=score,
                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
                    ))
@@ -849,7 +871,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
            else:
                repair = realign_window(beat, result.scene_id)
                if repair is not None:
-                    scene, aligned_in_s, score, repair_reason = repair
+                    scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                    logger.info(
                        "Beat %d: realigned inside matched scene by vision action window (%s)",
                        result.beat_id,
@@ -859,7 +881,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                        result,
                        scene_id=scene.scene_id,
                        in_point_s=aligned_in_s,
-                        out_point_s=aligned_in_s + result.duration_s,
+                        out_point_s=aligned_in_s + usable_duration_s,
                        in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
                        match_score=score,
                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,