Fix: prevent tail-trimming of valid matches at hard scene boundaries in global_scan.py

2026-05-06 19:06:33 +02:00
parent 72e22969b4
commit c972894972
1 changed file with 71 additions and 4 deletions
@@ -260,12 +260,21 @@ def _fixed_content_sequence_score(
    in_point_s: float,
    templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]],
    cfg: AppConfig,
+    max_source_duration_s: float | None = None,
 ) -> float:
    if not templates:
        return 0.0

+    active = (
+        [(off, feats) for off, feats in templates if off <= max_source_duration_s]
+        if max_source_duration_s is not None and max_source_duration_s > 0
+        else templates
+    )
+    if not active:
+        return 0.0
+
    scores: list[float] = []
-    for offset_s, ref_features in templates:
+    for offset_s, ref_features in active:
        frame = grab_frame_at(cap, in_point_s + offset_s)
        if frame is None:
            return 0.0
@@ -802,6 +811,7 @@ def align_in_point_by_content(
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
+    scene_end_s: float | None = None,
 ) -> tuple[float, float]:
    """
    Find the frame offset directly from image content around a rough match.
@@ -809,6 +819,10 @@ def align_in_point_by_content(
    This is deliberately local: once a candidate shot is plausible, scanning a
    small window around it with many reference frames is faster and more robust
    than repeating a global scan or applying a fixed frame preroll.
+
+    scene_end_s: when provided, each candidate in-point is scored using only the
+    templates whose offsets stay within the source scene, which prevents
+    cross-boundary frame reads from dragging scores into wrong-content territory.
    """
    templates = _content_alignment_templates(beat, cfg)
    if not templates:
@@ -830,7 +844,15 @@ def align_in_point_by_content(
        best_score = -1.0
        t = start_s
        while t <= end_s:
-            score = _content_alignment_score(cap, t, templates, cfg)
+            if scene_end_s is not None:
+                avail_s = scene_end_s - t
+                if avail_s > 0:
+                    active_templates = [(off, tpl) for off, tpl in templates if off <= avail_s]
+                else:
+                    active_templates = []
+            else:
+                active_templates = templates
+            score = _content_alignment_score(cap, t, active_templates, cfg) if active_templates else -1.0
            if score > best_score + tie_delta:
                best_score = score
                best_in = t
@@ -981,6 +1003,7 @@ def estimate_usable_source_duration(
    cfg: AppConfig,
    sample_step_s: float | None = None,
    min_keep_s: float = 0.5,
+    scene_end_s: float | None = None,
 ) -> tuple[float, float]:
    """
    Estimate how long the source stays visually aligned with the beat.
@@ -1032,6 +1055,17 @@ def estimate_usable_source_duration(
            break

    tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
+    
+    if scene_end_s is not None:
+        avail_s = scene_end_s - in_point_s
+        if avail_s > 0 and last_good >= avail_s - (step_s * 2):
+            logger.info('Beat %d: Boundary hit: last_good=%.3f, avail_s=%.3f, step_s=%.3f. Disabling tail safety.', beat.beat_id, last_good, avail_s, step_s)
+            tail_safety_s = 0.0
+            last_good = max(last_good, avail_s)
+        else:
+            if avail_s > 0:
+                logger.info('Beat %d: Boundary NOT hit: last_good=%.3f, avail_s=%.3f, thresh=%.3f', beat.beat_id, last_good, avail_s, avail_s - (step_s * 2))
+
    usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s))
    if usable < min_keep_s and scores:
        usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s))
@@ -1071,6 +1105,7 @@ def refine_in_point_with_sequence(
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
+    scene_end_s: float | None = None,
 ) -> tuple[float, float]:
    """
    Refine a rough source in-point by comparing several frames across the beat.
@@ -1078,7 +1113,7 @@ def refine_in_point_with_sequence(
    Returns:
        (best_in_point_s, sequence_score)
    """
-    return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s)
+    return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s, scene_end_s)


 def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
@@ -1357,23 +1392,51 @@ def run_global_scan(
                    midpoint_t = coarse_in_s + (b.duration_s / 2)
                    fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg)
                    rough_in_s = max(0.0, fine_t - (b.duration_s / 2))
+                    # Don't let midpoint refinement jump to a different scene — a
+                    # scene boundary crossed here means a wrong-content frame won
+                    # the template match.  Revert so the coarse candidate retains
+                    # its original scene context.
+                    if scenes:
+                        coarse_scene_check = _find_scene_for_time(scenes, coarse_in_s, cfg)
+                        rough_scene_check = _find_scene_for_time(scenes, rough_in_s, cfg)
+                        if (
+                            coarse_scene_check is not None
+                            and rough_scene_check is not None
+                            and coarse_scene_check.scene_id != rough_scene_check.scene_id
+                        ):
+                            rough_in_s = coarse_in_s
                local_align_window_s = (
                    min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds)
                    if is_weighted_seed_candidate
                    else None
                )
+                # When rough_in_s is near a scene boundary, filter templates to
+                # stay within the available source window so cross-boundary frames
+                # from unrelated content don't corrupt alignment and scoring.
+                rough_scene_for_boundary = _find_scene_for_time(scenes, rough_in_s, cfg) if scenes else None
+                rough_scene_end_s: float | None = None
+                if rough_scene_for_boundary is not None and matchable_duration_s > 0:
+                    _avail = float(rough_scene_for_boundary.end_s) - rough_in_s
+                    if 0 < _avail < matchable_duration_s:
+                        rough_scene_end_s = float(rough_scene_for_boundary.end_s)
                refined_in_s, sequence_score = refine_in_point_with_sequence(
                    b,
                    rough_in_s,
                    cfg,
                    search_window_s=local_align_window_s,
+                    scene_end_s=rough_scene_end_s,
                )
                scene = _find_scene_for_time(scenes, refined_in_s, cfg)
                scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps
                adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg)
                adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene)
                scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
-                usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg)
+                usable_duration_s, span_score = estimate_usable_source_duration(
+                    b, 
+                    adjusted_in_s, 
+                    cfg,
+                    scene_end_s=float(scene.end_s) if scene is not None else None
+                )
                out_s = adjusted_in_s + usable_duration_s
                if scene is not None:
                    out_s = min(out_s, scene.end_s)
@@ -1385,6 +1448,7 @@ def run_global_scan(
                        adjusted_in_s,
                        validation_templates,
                        cfg,
+                        max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
                    )
                content_score = original_content_score
                content_in_s, align_content_score = align_in_point_by_content(
@@ -1396,6 +1460,7 @@ def run_global_scan(
                        if local_align_window_s is not None
                        else min(0.8, cfg.cv.deep_scan.content_align_window_seconds)
                    ),
+                    scene_end_s=rough_scene_end_s,
                )
                if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds:
                    with open_video(cfg.paths.source_movie) as validation_cap:
@@ -1404,6 +1469,7 @@ def run_global_scan(
                            content_in_s,
                            validation_templates,
                            cfg,
+                            max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
                        )
                    if aligned_content_score >= original_content_score + 0.01:
                        adjusted_in_s = content_in_s
@@ -1459,6 +1525,7 @@ def run_global_scan(
                                adjusted_in_s,
                                validation_templates,
                                cfg,
+                                max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
                            )
                    else:
                        motion_score = original_motion_score