diff --git a/src/cv/global_scan.py b/src/cv/global_scan.py index da24211..1b97b52 100644 --- a/src/cv/global_scan.py +++ b/src/cv/global_scan.py @@ -260,12 +260,21 @@ def _fixed_content_sequence_score( in_point_s: float, templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]], cfg: AppConfig, + max_source_duration_s: float | None = None, ) -> float: if not templates: return 0.0 + active = ( + [(off, feats) for off, feats in templates if off <= max_source_duration_s] + if max_source_duration_s is not None and max_source_duration_s > 0 + else templates + ) + if not active: + return 0.0 + scores: list[float] = [] - for offset_s, ref_features in templates: + for offset_s, ref_features in active: frame = grab_frame_at(cap, in_point_s + offset_s) if frame is None: return 0.0 @@ -802,6 +811,7 @@ def align_in_point_by_content( estimated_in_point_s: float, cfg: AppConfig, search_window_s: float | None = None, + scene_end_s: float | None = None, ) -> tuple[float, float]: """ Find the frame offset directly from image content around a rough match. @@ -809,6 +819,10 @@ def align_in_point_by_content( This is deliberately local: once a candidate shot is plausible, scanning a small window around it with many reference frames is faster and more robust than repeating a global scan or applying a fixed frame preroll. + + scene_end_s: when provided, templates are dynamically filtered per candidate + to only offsets that stay within the source scene, preventing cross-boundary + frame reads from dragging scores into wrong-content territory. """ templates = _content_alignment_templates(beat, cfg) if not templates: @@ -830,7 +844,15 @@ def align_in_point_by_content( best_score = -1.0 t = start_s while t <= end_s: - score = _content_alignment_score(cap, t, templates, cfg) + if scene_end_s is not None: + avail_s = scene_end_s - t + if avail_s > 0: + active_templates = [(off, tpl) for off, tpl in templates if off <= avail_s] + else: + active_templates = [] + else: + active_templates = templates + score = _content_alignment_score(cap, t, active_templates, cfg) if active_templates else -1.0 if score > best_score + tie_delta: best_score = score best_in = t @@ -981,6 +1003,7 @@ def estimate_usable_source_duration( cfg: AppConfig, sample_step_s: float | None = None, min_keep_s: float = 0.5, + scene_end_s: float | None = None, ) -> tuple[float, float]: """ Estimate how long the source stays visually aligned with the beat. @@ -1032,6 +1055,17 @@ def estimate_usable_source_duration( break tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps) + + if scene_end_s is not None: + avail_s = scene_end_s - in_point_s + if avail_s > 0 and last_good >= avail_s - (step_s * 2): + logger.info('Beat %d: Boundary hit: last_good=%.3f, avail_s=%.3f, step_s=%.3f. Disabling tail safety.', beat.beat_id, last_good, avail_s, step_s) + tail_safety_s = 0.0 + last_good = max(last_good, avail_s) + else: + if avail_s > 0: + logger.info('Beat %d: Boundary NOT hit: last_good=%.3f, avail_s=%.3f, thresh=%.3f', beat.beat_id, last_good, avail_s, avail_s - (step_s * 2)) + usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s)) if usable < min_keep_s and scores: usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s)) @@ -1071,6 +1105,7 @@ def refine_in_point_with_sequence( estimated_in_point_s: float, cfg: AppConfig, search_window_s: float | None = None, + scene_end_s: float | None = None, ) -> tuple[float, float]: """ Refine a rough source in-point by comparing several frames across the beat. @@ -1078,7 +1113,7 @@ def refine_in_point_with_sequence( Returns: (best_in_point_s, sequence_score) """ - return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s) + return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s, scene_end_s) def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig): @@ -1357,23 +1392,51 @@ def run_global_scan( midpoint_t = coarse_in_s + (b.duration_s / 2) fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg) rough_in_s = max(0.0, fine_t - (b.duration_s / 2)) + # Don't let midpoint refinement jump to a different scene — a + # scene boundary crossed here means a wrong-content frame won + # the template match. Revert so the coarse candidate retains + # its original scene context. + if scenes: + coarse_scene_check = _find_scene_for_time(scenes, coarse_in_s, cfg) + rough_scene_check = _find_scene_for_time(scenes, rough_in_s, cfg) + if ( + coarse_scene_check is not None + and rough_scene_check is not None + and coarse_scene_check.scene_id != rough_scene_check.scene_id + ): + rough_in_s = coarse_in_s local_align_window_s = ( min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds) if is_weighted_seed_candidate else None ) + # When rough_in_s is near a scene boundary, filter templates to + # stay within the available source window so cross-boundary frames + # from unrelated content don't corrupt alignment and scoring. + rough_scene_for_boundary = _find_scene_for_time(scenes, rough_in_s, cfg) if scenes else None + rough_scene_end_s: float | None = None + if rough_scene_for_boundary is not None and matchable_duration_s > 0: + _avail = float(rough_scene_for_boundary.end_s) - rough_in_s + if 0 < _avail < matchable_duration_s: + rough_scene_end_s = float(rough_scene_for_boundary.end_s) refined_in_s, sequence_score = refine_in_point_with_sequence( b, rough_in_s, cfg, search_window_s=local_align_window_s, + scene_end_s=rough_scene_end_s, ) scene = _find_scene_for_time(scenes, refined_in_s, cfg) scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg) adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene) scene = _find_scene_for_time(scenes, adjusted_in_s, cfg) - usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg) + usable_duration_s, span_score = estimate_usable_source_duration( + b, + adjusted_in_s, + cfg, + scene_end_s=float(scene.end_s) if scene is not None else None + ) out_s = adjusted_in_s + usable_duration_s if scene is not None: out_s = min(out_s, scene.end_s) @@ -1385,6 +1448,7 @@ def run_global_scan( adjusted_in_s, validation_templates, cfg, + max_source_duration_s=duration_s if rough_scene_end_s is not None else None, ) content_score = original_content_score content_in_s, align_content_score = align_in_point_by_content( @@ -1396,6 +1460,7 @@ def run_global_scan( if local_align_window_s is not None else min(0.8, cfg.cv.deep_scan.content_align_window_seconds) ), + scene_end_s=rough_scene_end_s, ) if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds: with open_video(cfg.paths.source_movie) as validation_cap: @@ -1404,6 +1469,7 @@ def run_global_scan( content_in_s, validation_templates, cfg, + max_source_duration_s=duration_s if rough_scene_end_s is not None else None, ) if aligned_content_score >= original_content_score + 0.01: adjusted_in_s = content_in_s @@ -1459,6 +1525,7 @@ def run_global_scan( adjusted_in_s, validation_templates, cfg, + max_source_duration_s=duration_s if rough_scene_end_s is not None else None, ) else: motion_score = original_motion_score