From 2cc05e47374c522014120f4d9b9ceba9f4937a6e Mon Sep 17 00:00:00 2001
From: Melbar <tangshode@gmail.com>
Date: Sat, 2 May 2026 23:04:41 +0200
Subject: [PATCH] Trim retimed segments when phase drifts

---
 README.md |  5 +++++
 cli.py    | 52 +++++++++++++++++++++++++++++++++++++---------------
 2 files changed, 42 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 6f79a94..8634809 100644
--- a/README.md
+++ b/README.md
@@ -191,6 +191,11 @@ sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den
 Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung
 eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende
 beginnt.
+Der Segment-Offset zählt dabei nur über vorherige scorebare Bildinseln, nicht
+über schwarze oder blendige Lücken. Nach dem Retiming wird die nutzbare
+Source-Dauer erneut geschätzt; läuft die Source am Ende in eine sichtbar andere
+Aktionsphase, wird der Clip gekürzt und der Rest bleibt Placeholder/Fade statt
+einen falschen Bewegungsmoment zu zeigen.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
diff --git a/cli.py b/cli.py
index c0be792..5b00d96 100644
--- a/cli.py
+++ b/cli.py
@@ -640,12 +640,24 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
     from dataclasses import replace
     from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
     from src.cv.scene_indexer import build_scene_index
-    from src.cv.global_scan import align_in_point_by_content_and_motion
+    from src.cv.global_scan import align_in_point_by_content_and_motion, estimate_usable_source_duration
 
     logger = logging.getLogger(__name__)
     beats_by_id = {beat.beat_id: beat for beat in beats}
     scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)}
 
+    def visible_content_offset(action_beat, segment_start_offset_s: float) -> float:
+        content_offset_s = 0.0
+        for start_s, end_s in _reference_scoreable_segments(action_beat, cfg):
+            if end_s <= segment_start_offset_s:
+                content_offset_s += max(0.0, end_s - start_s)
+            elif start_s < segment_start_offset_s:
+                content_offset_s += max(0.0, segment_start_offset_s - start_s)
+                break
+            else:
+                break
+        return content_offset_s
+
     def realign_window(check_beat, scene_id: int, action_beat=None):
         scene = scenes_by_id.get(scene_id)
         if scene is None:
@@ -655,9 +667,10 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
             return None
         start_s, end_s, semantic_score, reason = found
         if action_beat is not None:
-            offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s)
-            start_s += offset_delta_s
-            end_s += offset_delta_s
+            segment_start_offset_s = max(0.0, check_beat.start_s - action_beat.start_s)
+            content_offset_s = visible_content_offset(action_beat, segment_start_offset_s)
+            start_s += content_offset_s
+            end_s += content_offset_s
         window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
         aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
             check_beat,
@@ -666,12 +679,16 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
             search_window_s=window_s,
         )
         aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
+        usable_duration_s, usable_score = estimate_usable_source_duration(check_beat, aligned_in_s, cfg)
+        usable_duration_s = max(0.0, min(check_beat.duration_s, usable_duration_s))
+        if usable_duration_s < max(0.32, check_beat.duration_s * 0.45):
+            usable_duration_s = check_beat.duration_s
         ok, verify_reason = validate_match_window_with_vision(
             check_beat,
             source_path=scene.source_path,
             scene_id=scene.scene_id,
             in_point_s=aligned_in_s,
-            out_point_s=aligned_in_s + check_beat.duration_s,
+            out_point_s=aligned_in_s + usable_duration_s,
             cfg=cfg,
         )
         if not ok:
@@ -683,8 +700,11 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                 verify_reason,
             )
             return None
-        score = max(combined_score, min(0.99, semantic_score * 0.70 + motion_score * 0.20 + content_score * 0.10))
-        return scene, aligned_in_s, score, f"{reason}; {verify_reason}"
+        score = max(
+            combined_score,
+            min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08),
+        )
+        return scene, aligned_in_s, usable_duration_s, score, f"{reason}; {verify_reason}"
 
     kept = []
     for result in results:
@@ -745,7 +765,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                     if repair is None:
                         new_segments.append(segment)
                         continue
-                    repair_scene, aligned_in_s, score, repair_reason = repair
+                    repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                     if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate:
                         new_segments.append(segment)
                         continue
@@ -755,7 +775,8 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                         segment,
                         scene_id=repair_scene.scene_id,
                         in_point_s=aligned_in_s,
-                        out_point_s=aligned_in_s + segment.duration_s,
+                        out_point_s=aligned_in_s + usable_duration_s,
+                        duration_s=usable_duration_s,
                         match_score=score,
                         is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
                     ))
@@ -783,7 +804,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                 if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0):
                     repair = realign_window(beat, result.scene_id)
                     if repair is not None:
-                        repair_scene, aligned_in_s, score, repair_reason = repair
+                        repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                         if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate:
                             logger.info(
                                 "Beat %d: realigned semantically valid long scene by motion/action window (%s)",
@@ -794,7 +815,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                                 result,
                                 scene_id=repair_scene.scene_id,
                                 in_point_s=aligned_in_s,
-                                out_point_s=aligned_in_s + result.duration_s,
+                                out_point_s=aligned_in_s + usable_duration_s,
                                 in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
                                 match_score=score,
                                 is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
@@ -817,13 +838,14 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                     if repair is None:
                         all_repaired = False
                         break
-                    scene, aligned_in_s, score, repair_reason = repair
+                    scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                     repair_reasons.append(repair_reason)
                     new_segments.append(replace(
                         segment,
                         scene_id=scene.scene_id,
                         in_point_s=aligned_in_s,
-                        out_point_s=aligned_in_s + segment.duration_s,
+                        out_point_s=aligned_in_s + usable_duration_s,
+                        duration_s=usable_duration_s,
                         match_score=score,
                         is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
                     ))
@@ -849,7 +871,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
             else:
                 repair = realign_window(beat, result.scene_id)
                 if repair is not None:
-                    scene, aligned_in_s, score, repair_reason = repair
+                    scene, aligned_in_s, usable_duration_s, score, repair_reason = repair
                     logger.info(
                         "Beat %d: realigned inside matched scene by vision action window (%s)",
                         result.beat_id,
@@ -859,7 +881,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                         result,
                         scene_id=scene.scene_id,
                         in_point_s=aligned_in_s,
-                        out_point_s=aligned_in_s + result.duration_s,
+                        out_point_s=aligned_in_s + usable_duration_s,
                         in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
                         match_score=score,
                         is_confirmed=score >= cfg.cv.deep_scan.match_threshold,