From 8415516f8900add1d73ade7b10823ed93ddbdea6 Mon Sep 17 00:00:00 2001
From: Melbar <tangshode@gmail.com>
Date: Sat, 2 May 2026 20:47:59 +0200
Subject: [PATCH] Retiming long scene matches by action phase

---
 README.md             |  9 +++++
 cli.py                | 91 +++++++++++++++++++++++++++++++++++++------
 src/cv/global_scan.py | 69 ++++++++++++++++++++++++++++++++
 3 files changed, 158 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index a40ba78..0c10105 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,10 @@ Aktionsphase verfehlt, sucht der Matcher automatisch dichter innerhalb derselben
 Source-Szene nach lokalen Vision-Fenstern mit der passenden Aktion und richtet
 den Inpoint mit der Motion-Phase-Prüfung darauf neu aus. Erst wenn auch diese
 In-Scene-Reparatur scheitert, wird der Treffer verworfen.
+Diese In-Scene-Reparatur läuft auch für semantisch gültige Treffer aus langen
+Source-Szenen. Dadurch kann ein grob passender Dialogmoment nicht bestehen
+bleiben, wenn ein anderes lokales Fenster derselben Szene die gesuchte
+Aktionsphase und Bewegung klarer trifft.
 Der gewichtete Vision-Seed-Pfad ersetzt standardmäßig keinen normalen
 FFmpeg-Vollscan. Vision-Beschreibungen sind semantische Hinweise, aber keine
 Beweise; der volle CV-Scan bleibt deshalb aktiv, damit falsch bewertete
@@ -200,6 +204,11 @@ Nach einem dichten Vision-Treffer darf der spätere lokale Aligner nur noch im
 Bereich dieses Scan-Schritts nachjustieren. So kann ein korrekt gefundener
 Bewegungsmoment nicht wieder um viele Frames in eine ähnlich aussehende Phase
 derselben Szene verschoben werden.
+Für Vision-Action-Fenster nutzt die finale Retiming-Prüfung eine gemeinsame
+Content-und-Motion-Suche pro Frame. Content und Bewegungsphase werden dabei
+nicht mehr als zwei getrennte Korrekturschritte angewendet; das verhindert,
+dass eine kurze Geste erst korrekt erkannt und anschließend in eine spätere
+ähnliche Körperhaltung verschoben wird.
 Wenn mehrere Vision-Kandidaten in derselben Source-Szene ähnlich gut scoren
 und die Beat-Dauer abdecken, bevorzugt der Matcher die frühere Phase. Das
 verhindert, dass ein späterer, minimal stärkerer Standbildtreffer die
diff --git a/cli.py b/cli.py
index 76abeca..69d3637 100644
--- a/cli.py
+++ b/cli.py
@@ -640,7 +640,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
     from dataclasses import replace
     from src.llm.vision_cache import find_action_window_in_scene, validate_match_window_with_vision
     from src.cv.scene_indexer import build_scene_index
-    from src.cv.global_scan import align_in_point_by_content, align_in_point_by_motion
+    from src.cv.global_scan import align_in_point_by_content_and_motion
 
     logger = logging.getLogger(__name__)
     beats_by_id = {beat.beat_id: beat for beat in beats}
@@ -654,19 +654,13 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
         if found is None:
             return None
         start_s, end_s, semantic_score, reason = found
-        window_s = max(1.0, min(4.0, (end_s - start_s) * 1.5))
-        motion_in_s, motion_score = align_in_point_by_motion(
+        window_s = max(3.0, min(8.0, (end_s - start_s) * 4.0))
+        aligned_in_s, combined_score, content_score, motion_score = align_in_point_by_content_and_motion(
             check_beat,
             start_s,
             cfg,
             search_window_s=window_s,
         )
-        aligned_in_s, content_score = align_in_point_by_content(
-            check_beat,
-            motion_in_s,
-            cfg,
-            search_window_s=min(window_s, 0.8),
-        )
         aligned_in_s = max(scene.start_s, min(aligned_in_s, max(scene.start_s, scene.end_s - check_beat.duration_s)))
         ok, verify_reason = validate_match_window_with_vision(
             check_beat,
@@ -685,7 +679,7 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                 verify_reason,
             )
             return None
-        score = max(content_score, min(0.99, semantic_score * 0.75 + motion_score * 0.25))
+        score = max(combined_score, min(0.99, semantic_score * 0.70 + motion_score * 0.20 + content_score * 0.10))
         return scene, aligned_in_s, score, f"{reason}; {verify_reason}"
 
     kept = []
@@ -728,7 +722,82 @@ def _filter_semantically_invalid_vision_matches(results: list, beats: list, cfg)
                 valid = False
                 break
         if valid:
-            kept.append(result)
+            repaired = False
+            if getattr(result, "segments", ()):
+                new_segments = []
+                repair_reasons = []
+                changed = False
+                for segment in result.segments:
+                    scene = scenes_by_id.get(segment.scene_id)
+                    if scene is None or scene.duration_s <= max(segment.duration_s * 1.6, 6.0):
+                        new_segments.append(segment)
+                        continue
+                    segment_beat = replace(
+                        beat,
+                        start_s=beat.start_s + segment.trailer_offset_s,
+                        end_s=beat.start_s + segment.trailer_offset_s + segment.duration_s,
+                    )
+                    repair = realign_window(segment_beat, segment.scene_id)
+                    if repair is None:
+                        new_segments.append(segment)
+                        continue
+                    repair_scene, aligned_in_s, score, repair_reason = repair
+                    if abs(aligned_in_s - segment.in_point_s) <= 1.0 / cfg.export.edl_frame_rate:
+                        new_segments.append(segment)
+                        continue
+                    changed = True
+                    repair_reasons.append(repair_reason)
+                    new_segments.append(replace(
+                        segment,
+                        scene_id=repair_scene.scene_id,
+                        in_point_s=aligned_in_s,
+                        out_point_s=aligned_in_s + segment.duration_s,
+                        match_score=score,
+                        is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
+                    ))
+                if changed and new_segments:
+                    first = new_segments[0]
+                    repaired_score = min(seg.match_score for seg in new_segments)
+                    logger.info(
+                        "Beat %d: realigned semantically valid long scene by motion/action windows (%s)",
+                        result.beat_id,
+                        "; ".join(repair_reasons),
+                    )
+                    kept.append(replace(
+                        result,
+                        scene_id=first.scene_id,
+                        in_point_s=first.in_point_s,
+                        out_point_s=first.out_point_s,
+                        in_point_frame=int(first.in_point_s * cfg.export.edl_frame_rate),
+                        match_score=repaired_score,
+                        is_confirmed=repaired_score >= cfg.cv.deep_scan.match_threshold,
+                        segments=tuple(new_segments),
+                    ))
+                    repaired = True
+            else:
+                scene = scenes_by_id.get(result.scene_id)
+                if scene is not None and scene.duration_s > max(result.duration_s * 1.6, 6.0):
+                    repair = realign_window(beat, result.scene_id)
+                    if repair is not None:
+                        repair_scene, aligned_in_s, score, repair_reason = repair
+                        if abs(aligned_in_s - result.in_point_s) > 1.0 / cfg.export.edl_frame_rate:
+                            logger.info(
+                                "Beat %d: realigned semantically valid long scene by motion/action window (%s)",
+                                result.beat_id,
+                                repair_reason,
+                            )
+                            kept.append(replace(
+                                result,
+                                scene_id=repair_scene.scene_id,
+                                in_point_s=aligned_in_s,
+                                out_point_s=aligned_in_s + result.duration_s,
+                                in_point_frame=int(aligned_in_s * cfg.export.edl_frame_rate),
+                                match_score=score,
+                                is_confirmed=score >= cfg.cv.deep_scan.match_threshold,
+                            ))
+                            repaired = True
+            if not repaired:
+                kept.append(result)
         else:
             if getattr(result, "segments", ()):
                 new_segments = []
diff --git a/src/cv/global_scan.py b/src/cv/global_scan.py
index 1df7984..8b917d2 100644
--- a/src/cv/global_scan.py
+++ b/src/cv/global_scan.py
@@ -871,6 +871,75 @@ def align_in_point_by_motion(
     return best_in, max(0.0, best_score)
 
 
+def align_in_point_by_content_and_motion(
+    beat: TrailerBeat,
+    estimated_in_point_s: float,
+    cfg: AppConfig,
+    search_window_s: float | None = None,
+) -> tuple[float, float, float, float]:
+    """
+    Align a candidate using still-frame content and motion phase together.
+
+    Running content and motion as separate passes can overshoot short action
+    phases: one pass may land on the right broad gesture and the next can slide
+    to a visually similar but later posture. A joint score keeps the in-point
+    tied to the same frame hypothesis throughout the local search.
+
+    Each frame step within ``search_window_s`` (default: the configured
+    ``content_align_window_seconds``) around ``estimated_in_point_s`` is scored
+    as ``0.64 * content + 0.36 * motion`` minus a distance penalty of 0.05 per
+    second (capped at 0.18). Candidates within ``start_tie_break_score_delta``
+    of the anchor score prefer the one closer to the estimate.
+
+    Returns ``(in_point_s, combined, content, motion)``. All three scores
+    belong to the returned in-point and are floored at 0.0; without content
+    templates the estimate comes back unchanged with zero scores.
+    """
+    templates = _prepare_beat_templates(beat, cfg)
+    if not templates:
+        return estimated_in_point_s, 0.0, 0.0, 0.0
+    # Motion templates are only needed once a search actually takes place.
+    motion_templates = _prepare_motion_templates(beat, cfg)
+    use_motion = len(motion_templates) >= 2
+    window_s = search_window_s
+    if window_s is None:
+        window_s = cfg.cv.deep_scan.content_align_window_seconds
+    tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
+
+    best_in = estimated_in_point_s
+    anchor_score = -1.0  # reference for the tie band; a tie never changes it
+    best_score = best_content = best_motion = -1.0  # scores of ``best_in``
+    with open_video(cfg.paths.source_movie) as cap:
+        # A reported FPS of 0 is falsy and falls back to the export frame rate.
+        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
+        frame_step_s = 1.0 / fps
+        t = max(0.0, estimated_in_point_s - window_s)
+        end_s = estimated_in_point_s + window_s
+        while t <= end_s:
+            content_score = _content_alignment_score(cap, t, templates, cfg)
+            # Too few motion templates: content stands in for the motion term.
+            motion_score = content_score
+            if use_motion:
+                motion_score = _motion_phase_score(cap, t, motion_templates, cfg)
+            if content_score < 0 or motion_score < 0:
+                # A negative score on either term excludes this sample.
+                t = round(t + frame_step_s, 6)
+                continue
+            gap = abs(t - estimated_in_point_s)
+            score = content_score * 0.64 + motion_score * 0.36 - min(0.18, gap * 0.05)
+            best_gap = abs(best_in - estimated_in_point_s)
+            closer = gap < best_gap or (abs(gap - best_gap) <= frame_step_s * 0.5 and t < best_in)
+            new_anchor = score > anchor_score + tie_delta
+            if new_anchor:
+                anchor_score = score
+            if new_anchor or (score >= anchor_score - tie_delta and closer):
+                best_in, best_score = t, score
+                best_content, best_motion = content_score, motion_score
+            t = round(t + frame_step_s, 6)
+
+    return best_in, max(0.0, best_score), max(0.0, best_content), max(0.0, best_motion)
+
+
 def estimate_usable_source_duration(
     beat: TrailerBeat,
     in_point_s: float,