From a5a84a914505badd0eb8b1c9a18a69115dae1dc1 Mon Sep 17 00:00:00 2001 From: Melbar Date: Sat, 2 May 2026 17:59:18 +0200 Subject: [PATCH] Use motion phase for in-scene timing --- README.md | 9 ++++-- cli.py | 12 ++++++-- src/cv/global_scan.py | 67 +++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index f0207a1..098d57e 100644 --- a/README.md +++ b/README.md @@ -137,6 +137,11 @@ Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls. +Zusätzlich wird die Bewegungsphase über Frame-zu-Frame-Differenzen verglichen. +Dadurch kann der Matcher innerhalb derselben Source-Szene unterscheiden, ob +zwei Figuren noch sprechen, sich annähern, bereits im Kontakt sind oder sich +wieder voneinander lösen. Ein optisch ähnlicher Standbild-Treffer reicht damit +nicht mehr aus, wenn der Bewegungsverlauf nicht zur Referenz passt. Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst den Inpoint bestimmt. @@ -167,8 +172,8 @@ nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist. Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet -den Inpoint darauf neu aus. Erst wenn auch diese In-Scene-Reparatur scheitert, -wird der Treffer verworfen. +den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese +In-Scene-Reparatur scheitert, wird der Treffer verworfen. Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete diff --git a/cli.py b/cli.py index 18f68bb..76abeca 100644 --- a/cli.py +++ b/cli.py @@ -640,7 +640,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) from dataclasses import replace from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision from src.cv.scene_indexer import build_scene_index - from src.cv.global_scan import align_in_point_by_content + from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion logger = logging.getLogger(__name__) beats_by_id = {beat.beat_id: beat for beat in beats} @@ -655,12 +655,18 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) return None start_s, end_s, semantic_score, reason = found window_s = max(1.0, min(4.0, (end_s - start_s) * 1.5)) - aligned_in_s, content_score = align_in_point_by_content( + motion_in_s, motion_score = align_in_point_by_motion( check_beat, start_s, cfg, search_window_s=window_s, ) + aligned_in_s, content_score = align_in_point_by_content( + check_beat, + motion_in_s, + cfg, + search_window_s=min(window_s, 0.8), + ) aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s))) ok, verify_reason = validate_match_window_with_vision( check_beat, @@ -679,7 +685,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg) verify_reason, ) return None - score = max(content_score, min(0.99, semantic_score)) + score = max(content_score, min(0.99, semantic_score * 0.75 + motion_score * 0.25)) return scene, aligned_in_s, score, f"{reason}; {verify_reason}" kept = [] diff --git a/src/cv/global_scan.py b/src/cv/global_scan.py index 89b0930..1df7984 100644 --- a/src/cv/global_scan.py +++ b/src/cv/global_scan.py @@ -827,6 +827,50 @@ def _motion_phase_score( return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35) +def align_in_point_by_motion( + beat: TrailerBeat, + estimated_in_point_s: float, + cfg: AppConfig, + search_window_s: float | None = None, +) -> tuple[float, float]: + """ + Align a candidate by matching the frame-to-frame motion pattern. + + This catches the common failure mode where the right source scene is found, + but the in-point is a few seconds too early or late inside a repeated + conversation/action beat. + """ + motion_templates = _prepare_motion_templates(beat, cfg) + if len(motion_templates) < 2: + return estimated_in_point_s, 0.0 + + with open_video(cfg.paths.source_movie) as cap: + fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate + frame_step_s = 1.0 / fps + window_s = ( + search_window_s + if search_window_s is not None + else cfg.cv.deep_scan.content_align_window_seconds + ) + start_s = max(0.0, estimated_in_point_s - window_s) + end_s = estimated_in_point_s + window_s + tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta + + best_in = estimated_in_point_s + best_score = -1.0 + t = start_s + while t <= end_s: + score = _motion_phase_score(cap, t, motion_templates, cfg) + if score > best_score + tie_delta: + best_score = score + best_in = t + elif score >= best_score - tie_delta and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s): + best_in = t + t = round(t + frame_step_s, 6) + + return best_in, max(0.0, best_score) + + def estimate_usable_source_duration( beat: TrailerBeat, in_point_s: float, @@ -1190,6 +1234,7 @@ def run_global_scan( for _, coarse_score, in_point_s in reranked_candidates[:refine_limit] ] validation_templates = _prepare_validation_templates(b, cfg) + motion_templates = _prepare_motion_templates(b, cfg) logger.info( 'Beat %d: content-reranked top %d / %d candidates.', b.beat_id, @@ -1270,6 +1315,16 @@ def run_global_scan( if matchable_duration_s > 0 else 0.0 ) + motion_score = 0.0 + if len(motion_templates) >= 2: + with open_video(cfg.paths.source_movie) as motion_cap: + motion_score = _motion_phase_score( + motion_cap, + adjusted_in_s, + motion_templates, + cfg, + ) + if is_weighted_seed_candidate and scene is not None and content_score >= content_gate: contiguous_usable_s = _contiguous_scene_coverage_duration( b, @@ -1299,11 +1354,15 @@ def run_global_scan( final_score * (1.0 - scan_cfg.content_validation_weight) + content_score * scan_cfg.content_validation_weight ) + if len(motion_templates) >= 2: + motion_score_clamped = max(0.0, min(1.0, motion_score)) + final_score = final_score * 0.82 + motion_score_clamped * 0.18 if is_weighted_seed_candidate: vision_provisional_score = ( - content_score * 0.55 + content_score * 0.45 + duration_coverage * 0.33 + coarse_score * 0.12 + + max(0.0, min(1.0, motion_score)) * 0.10 ) final_score = max(final_score, vision_provisional_score) if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate: @@ -1332,7 +1391,7 @@ def run_global_scan( if duration_coverage < scan_cfg.min_duration_coverage: rejected_short_candidates += 1 logger.debug( - 'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', + 'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f', b.beat_id, adjusted_in_s, scene.scene_id if scene is not None else 'none', @@ -1340,6 +1399,7 @@ def run_global_scan( span_score, coarse_score, content_score, + motion_score, duration_coverage, final_score, ) @@ -1364,7 +1424,7 @@ def run_global_scan( continue logger.debug( - 'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f', + 'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f', b.beat_id, adjusted_in_s, scene.scene_id if scene is not None else 'none', @@ -1372,6 +1432,7 @@ def run_global_scan( span_score, coarse_score, content_score, + motion_score, duration_coverage, final_score, )