Retiming long scene matches by action phase

2026-05-02 20:47:59 +02:00
parent 252f710396
commit 8415516f89
3 changed files with 158 additions and 11 deletions
@@ -871,6 +871,75 @@ def align_in_point_by_motion(
    return best_in, max(0.0, best_score)


+def align_in_point_by_content_and_motion(
+    beat: TrailerBeat,
+    estimated_in_point_s: float,
+    cfg: AppConfig,
+    search_window_s: float | None = None,
+) -> tuple[float, float, float, float]:
+    """
+    Align a candidate using still-frame content and motion phase together.
+
+    Running content and motion as separate passes can overshoot short action
+    phases: one pass may land on the right broad gesture and the next can slide
+    to a visually similar but later posture. A joint score keeps the in-point
+    tied to the same frame hypothesis throughout the local search.
+    """
+    templates = _prepare_beat_templates(beat, cfg)
+    motion_templates = _prepare_motion_templates(beat, cfg)
+    if not templates:
+        return estimated_in_point_s, 0.0, 0.0, 0.0
+
+    with open_video(cfg.paths.source_movie) as cap:
+        fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate
+        frame_step_s = 1.0 / fps
+        window_s = (
+            search_window_s
+            if search_window_s is not None
+            else cfg.cv.deep_scan.content_align_window_seconds
+        )
+        start_s = max(0.0, estimated_in_point_s - window_s)
+        end_s = estimated_in_point_s + window_s
+        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta
+
+        best_in = estimated_in_point_s
+        best_score = -1.0
+        best_content = -1.0
+        best_motion = -1.0
+        t = start_s
+        while t <= end_s:
+            content_score = _content_alignment_score(cap, t, templates, cfg)
+            motion_score = (
+                _motion_phase_score(cap, t, motion_templates, cfg)
+                if len(motion_templates) >= 2
+                else content_score
+            )
+            if content_score < 0 or motion_score < 0:
+                t = round(t + frame_step_s, 6)
+                continue
+            raw_score = content_score * 0.64 + motion_score * 0.36
+            anchor_penalty = min(0.18, abs(t - estimated_in_point_s) * 0.05)
+            score = raw_score - anchor_penalty
+            if score > best_score + tie_delta:
+                best_score = score
+                best_in = t
+                best_content = content_score
+                best_motion = motion_score
+            elif score >= best_score - tie_delta:
+                current_distance = abs(t - estimated_in_point_s)
+                best_distance = abs(best_in - estimated_in_point_s)
+                if current_distance < best_distance or (
+                    abs(current_distance - best_distance) <= frame_step_s * 0.5
+                    and t < best_in
+                ):
+                    best_in = t
+                    best_content = content_score
+                    best_motion = motion_score
+            t = round(t + frame_step_s, 6)
+
+    return best_in, max(0.0, best_score), max(0.0, best_content), max(0.0, best_motion)
+
+
 def estimate_usable_source_duration(
    beat: TrailerBeat,
    in_point_s: float,