From 2cc05e47374c522014120f4d9b9ceba9f4937a6e Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 2 May 2026 23:04:41 +0200 Subject: [PATCH] Trim retimed segments when phase drifts --- README.md | 5 +++++ cli.py | 52 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6f79a94..8634809 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,11 @@ sichtbare Segment begrenzt; der gefundene Source-Inpoint wird dabei um den Trailer-Offset des Segments verschoben. So geht die globale Aktionsbeschreibung eines Beats nicht verloren, nur weil der scorebare Teil erst nach einer Blende beginnt. +Der Segment-Offset zählt dabei nur über vorherige scorebare Bildinseln, nicht +über schwarze oder blendige Lücken. Nach dem Retiming wird die nutzbare +Source-Dauer erneut geschätzt; läuft die Source am Ende in eine sichtbar andere +Aktionsphase, wird der Clip gekürzt und der Rest bleibt Placeholder/Fade, statt +einen falschen Bewegungsmoment zu zeigen. Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen FFmpeg-Vollscan. 
Vision-Beschreibungen sind semantische Hinweise, aber keine Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete diff --git a/cli.py b/cli.py index c0be792..5b00d96 100644 --- a/cli.py +++ b/cli.py @@ -640,12 +640,24 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) from dataclasses import replace from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision from src.cv.scene_indexer import build_scene_index - from src.cv.global_scan import align_in_point_by_content_and_motion + from src.cv.global_scan import align_in_point_by_content_and_motion, estimate_usable_source_duration logger = logging.getLogger(__name__) beats_by_id = {beat.beat_id: beat for beat in beats} scenes_by_id = {scene.scene_id: scene for scene in build_scene_index(cfg)} + def visible_content_offset(action_beat, segment_start_offset_s: float) -> float: + content_offset_s = 0.0 + for start_s, end_s in _reference_scoreable_segments(action_beat, cfg): + if end_s <= segment_start_offset_s: + content_offset_s += max(0.0, end_s - start_s) + elif start_s < segment_start_offset_s: + content_offset_s += max(0.0, segment_start_offset_s - start_s) + break + else: + break + return content_offset_s + def realign_window(check_beat, scene_id: int, action_beat=None): scene = scenes_by_id.get(scene_id) if scene is None: @@ -655,9 +667,10 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) return None start_s, end_s, semantic_score, reason = found if action_beat is not None: - offset_delta_s = max(0.0, check_beat.start_s - action_beat.start_s) - start_s += offset_delta_s - end_s += offset_delta_s + segment_start_offset_s = max(0.0, check_beat.start_s - action_beat.start_s) + content_offset_s = visible_content_offset(action_beat, segment_start_offset_s) + start_s += content_offset_s + end_s += content_offset_s window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0)) aligned_in_s, combined_score, 
content_score, motion_score = align_in_point_by_content_and_motion( check_beat, @@ -666,12 +679,16 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) search_window_s=window_s, ) aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s))) + usable_duration_s, usable_score = estimate_usable_source_duration(check_beat, aligned_in_s, cfg) + usable_duration_s = max(0.0, min(check_beat.duration_s, usable_duration_s)) + if usable_duration_s < max(0.32, check_beat.duration_s * 0.45): + usable_duration_s = check_beat.duration_s ok, verify_reason = validate_match_window_with_vision( check_beat, source_path=scene.source_path, scene_id=scene.scene_id, in_point_s=aligned_in_s, - out_point_s=aligned_in_s + check_beat.duration_s, + out_point_s=aligned_in_s + usable_duration_s, cfg=cfg, ) if not ok: @@ -683,8 +700,11 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) verify_reason, ) return None - score = max(combined_score, min(0.99, semantic_score * 0.70 + motion_score * 0.20 + content_score * 0.10)) - return scene, aligned_in_s, score, f"{reason}; {verify_reason}" + score = max( + combined_score, + min(0.99, semantic_score * 0.65 + motion_score * 0.18 + content_score * 0.09 + usable_score * 0.08), + ) + return scene, aligned_in_s, usable_duration_s, score, f"{reason}; {verify_reason}" kept = [] for result in results: @@ -745,7 +765,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) if repair is None: new_segments.append(segment) continue - repair_scene, aligned_in_s, score, repair_reason = repair + repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate: new_segments.append(segment) continue @@ -755,7 +775,8 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) segment, scene_id=repair_scene.scene_id, 
in_point_s=aligned_in_s, - out_point_s=aligned_in_s + segment.duration_s, + out_point_s=aligned_in_s + usable_duration_s, + duration_s=usable_duration_s, match_score=score, is_confirmed=score >= cfg.cv.deep_scan.match_threshold, )) @@ -783,7 +804,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0): repair = realign_window(beat, result.scene_id) if repair is not None: - repair_scene, aligned_in_s, score, repair_reason = repair + repair_scene, aligned_in_s, usable_duration_s, score, repair_reason = repair if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate: logger.info( "Beat %d: realigned semantically valid long scene by motion/action window (%s)", @@ -794,7 +815,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) result, scene_id=repair_scene.scene_id, in_point_s=aligned_in_s, - out_point_s=aligned_in_s + result.duration_s, + out_point_s=aligned_in_s + usable_duration_s, in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate), match_score=score, is_confirmed=score >= cfg.cv.deep_scan.match_threshold, @@ -817,13 +838,14 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) if repair is None: all_repaired = False break - scene, aligned_in_s, score, repair_reason = repair + scene, aligned_in_s, usable_duration_s, score, repair_reason = repair repair_reasons.append(repair_reason) new_segments.append(replace( segment, scene_id=scene.scene_id, in_point_s=aligned_in_s, - out_point_s=aligned_in_s + segment.duration_s, + out_point_s=aligned_in_s + usable_duration_s, + duration_s=usable_duration_s, match_score=score, is_confirmed=score >= cfg.cv.deep_scan.match_threshold, )) @@ -849,7 +871,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) else: repair = realign_window(beat, result.scene_id) if repair is not None: - scene, aligned_in_s, 
score, repair_reason = repair + scene, aligned_in_s, usable_duration_s, score, repair_reason = repair logger.info( "Beat %d: realigned inside matched scene by vision action window (%s)", result.beat_id, @@ -859,7 +881,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) result, scene_id=scene.scene_id, in_point_s=aligned_in_s, - out_point_s=aligned_in_s + result.duration_s, + out_point_s=aligned_in_s + usable_duration_s, in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate), match_score=score, is_confirmed=score >= cfg.cv.deep_scan.match_threshold,