From a5a84a914505badd0eb8b1c9a18a69115dae1dc1 Mon Sep 17 00:00:00 2001
From: Melbar <tangshode@gmail.com>
Date: Sat, 2 May 2026 17:59:18 +0200
Subject: [PATCH] Use motion phase for in-scene timing

---
 README.md             |  9 ++++--
 cli.py                | 12 ++++++--
 src/cv/global_scan.py | 67 +++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f0207a1..098d57e 100644
--- a/README.md
+++ b/README.md
@@ -137,6 +137,11 @@ Die Inpoint-Feinjustage bestimmt den Versatz lokal aus dem Bildinhalt: Um einen
 groben Treffer herum werden mehrere Referenzframes gegen mehrere Source-Offsets
 verglichen, und der beste gemeinsame Offset wird übernommen. Das ist schneller
 als ein erneuter globaler Scan und vermeidet pauschale Frame-Prerolls.
+Zusätzlich wird die Bewegungsphase über Frame-zu-Frame-Differenzen verglichen.
+Dadurch kann der Matcher innerhalb derselben Source-Szene unterscheiden, ob
+zwei Figuren noch sprechen, sich annähern, bereits im Kontakt sind oder sich
+wieder voneinander lösen. Ein optisch ähnlicher Standbild-Treffer reicht damit
+nicht mehr aus, wenn der Bewegungsverlauf nicht zur Referenz passt.
 Schwarze Referenzframes aus Blenden oder Titel-Tails werden für diese
 Offset-Messung ausgelassen, damit echte Bildbewegung und nicht die Blende selbst
 den Inpoint bestimmt.
@@ -167,8 +172,8 @@ nicht gespeichert, selbst wenn der Low-Level-CV-Score hoch ist.
 Wenn die Szene selbst plausibel ist, aber der konkrete Source-Zeitpunkt diese
 Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
 Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
-den Inpoint darauf neu aus. Erst wenn auch diese In-Scene-Reparatur scheitert,
-wird der Treffer verworfen.
+den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese
+In-Scene-Reparatur scheitert, wird der Treffer verworfen.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
diff --git a/cli.py b/cli.py
index 18f68bb..76abeca 100644
--- a/cli.py
+++ b/cli.py
@@ -640,7 +640,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
     from dataclasses import replace
     from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
     from src.cv.scene_indexer import build_scene_index
-    from src.cv.global_scan import align_in_point_by_content
+    from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion
 
     logger = logging.getLogger(__name__)
     beats_by_id = {beat.beat_id: beat for beat in beats}
@@ -655,12 +655,18 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
             return None
         start_s, end_s, semantic_score, reason = found
         window_s = max(1.0, min(4.0, (end_s - start_s) * 1.5))
-        aligned_in_s, content_score = align_in_point_by_content(
+        motion_in_s, motion_score = align_in_point_by_motion(
             check_beat,
             start_s,
             cfg,
             search_window_s=window_s,
         )
+        aligned_in_s, content_score = align_in_point_by_content(
+            check_beat,
+            motion_in_s,
+            cfg,
+            search_window_s=min(window_s, 0.8),
+        )
         aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
         ok, verify_reason = validate_match_window_with_vision(
             check_beat,
@@ -679,7 +685,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                 verify_reason,
             )
             return None
-        score = max(content_score, min(0.99, semantic_score))
+        score = max(content_score, min(0.99, semantic_score * 0.75 + motion_score * 0.25))
         return scene, aligned_in_s, score, f"{reason}; {verify_reason}"
 
     kept = []
diff --git a/src/cv/global_scan.py b/src/cv/global_scan.py
index 89b0930..1df7984 100644
--- a/src/cv/global_scan.py
+++ b/src/cv/global_scan.py
@@ -827,6 +827,50 @@ def _motion_phase_score(
     return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35)
 
 
+def align_in_point_by_motion(
+    beat: TrailerBeat,
+    estimated_in_point_s: float,
+    cfg: AppConfig,
+    search_window_s: float | None = None,
+) -> tuple[float, float]:
+    """
+    Align a candidate by matching the frame-to-frame motion pattern.
+
+    Fixes in-points that land early/late inside a repeated conversation/action
+    beat by scoring source times one frame apart within +/- the search window.
+    Returns (in_point_s, score); (estimated_in_point_s, 0.0) if < 2 templates.
+    """
+    motion_templates = _prepare_motion_templates(beat, cfg)
+    if len(motion_templates) < 2:
+        return estimated_in_point_s, 0.0
+
+    with open_video(cfg.paths.source_movie) as cap:
+        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate  # EDL rate if capture reports 0
+        frame_step_s = 1.0 / fps  # scan step: one source frame
+        window_s = (
+            search_window_s
+            if search_window_s is not None
+            else cfg.cv.deep_scan.content_align_window_seconds
+        )
+        start_s = max(0.0, estimated_in_point_s - window_s)  # never seek before 0 s
+        end_s = estimated_in_point_s + window_s
+        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
+
+        best_in = estimated_in_point_s
+        best_score = -1.0  # initial value; clamped to 0.0 on return if never replaced
+        t = start_s  # candidate in-point, advanced by frame_step_s each iteration
+        while t <= end_s:
+            score = _motion_phase_score(cap, t, motion_templates, cfg)
+            if score > best_score + tie_delta:  # better by more than the tie margin
+                best_score = score
+                best_in = t
+            elif score >= best_score - tie_delta and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s):
+                best_in = t  # near-tie: prefer the time closer to the estimate; best_score is kept
+            t = round(t + frame_step_s, 6)  # NOTE(review): rounding presumably curbs float drift - confirm
+
+    return best_in, max(0.0, best_score)  # score may belong to an earlier frame than best_in after a near-tie
+
+
 def estimate_usable_source_duration(
     beat: TrailerBeat,
     in_point_s: float,
@@ -1190,6 +1234,7 @@ def run_global_scan(
                 for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
             ]
             validation_templates = _prepare_validation_templates(b, cfg)
+            motion_templates = _prepare_motion_templates(b, cfg)
             logger.info(
                 'Beat %d: content-reranked top %d / %d candidates.',
                 b.beat_id,
@@ -1270,6 +1315,16 @@ def run_global_scan(
                             if matchable_duration_s > 0 else 0.0
                         )
 
+                motion_score = 0.0
+                if len(motion_templates) >= 2:
+                    with open_video(cfg.paths.source_movie) as motion_cap:
+                        motion_score = _motion_phase_score(
+                            motion_cap,
+                            adjusted_in_s,
+                            motion_templates,
+                            cfg,
+                        )
+
                 if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
                     contiguous_usable_s = _contiguous_scene_coverage_duration(
                         b,
@@ -1299,11 +1354,15 @@ def run_global_scan(
                     final_score * (1.0 - scan_cfg.content_validation_weight)
                     + content_score * scan_cfg.content_validation_weight
                 )
+                if len(motion_templates) >= 2:
+                    motion_score_clamped = max(0.0, min(1.0, motion_score))
+                    final_score = final_score * 0.82 + motion_score_clamped * 0.18
                 if is_weighted_seed_candidate:
                     vision_provisional_score = (
-                        content_score * 0.55
+                        content_score * 0.45
                         + duration_coverage * 0.33
                         + coarse_score * 0.12
+                        + max(0.0, min(1.0, motion_score)) * 0.10
                     )
                     final_score = max(final_score, vision_provisional_score)
                 if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
@@ -1332,7 +1391,7 @@ def run_global_scan(
                 if duration_coverage < scan_cfg.min_duration_coverage:
                     rejected_short_candidates += 1
                     logger.debug(
-                        'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
+                        'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
                         b.beat_id,
                         adjusted_in_s,
                         scene.scene_id if scene is not None else 'none',
@@ -1340,6 +1399,7 @@ def run_global_scan(
                         span_score,
                         coarse_score,
                         content_score,
+                        motion_score,
                         duration_coverage,
                         final_score,
                     )
@@ -1364,7 +1424,7 @@ def run_global_scan(
                     continue
 
                 logger.debug(
-                    'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f coverage=%.2f final=%.3f',
+                    'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
                     b.beat_id,
                     adjusted_in_s,
                     scene.scene_id if scene is not None else 'none',
@@ -1372,6 +1432,7 @@ def run_global_scan(
                     span_score,
                     coarse_score,
                     content_score,
+                    motion_score,
                     duration_coverage,
                     final_score,
                 )