Fix: prevent tail-trimming of valid matches at hard scene boundaries in global_scan.py

This commit is contained in:
Melbar
2026-05-06 19:06:33 +02:00
parent 72e22969b4
commit c972894972
+71 -4
View File
@@ -260,12 +260,21 @@ def _fixed_content_sequence_score(
in_point_s: float,
templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]],
cfg: AppConfig,
max_source_duration_s: float | None = None,
) -> float:
if not templates:
return 0.0
active = (
[(off, feats) for off, feats in templates if off <= max_source_duration_s]
if max_source_duration_s is not None and max_source_duration_s > 0
else templates
)
if not active:
return 0.0
scores: list[float] = []
for offset_s, ref_features in templates:
for offset_s, ref_features in active:
frame = grab_frame_at(cap, in_point_s + offset_s)
if frame is None:
return 0.0
@@ -802,6 +811,7 @@ def align_in_point_by_content(
estimated_in_point_s: float,
cfg: AppConfig,
search_window_s: float | None = None,
scene_end_s: float | None = None,
) -> tuple[float, float]:
"""
Find the frame offset directly from image content around a rough match.
@@ -809,6 +819,10 @@ def align_in_point_by_content(
This is deliberately local: once a candidate shot is plausible, scanning a
small window around it with many reference frames is faster and more robust
than repeating a global scan or applying a fixed frame preroll.
scene_end_s: when provided, each candidate in-point is scored using only the
templates whose offsets fall at or before the end of the source scene, so frames
read from beyond the scene boundary cannot skew the score. Candidates with no
usable templates (at or past scene_end_s) are scored -1.0.
"""
templates = _content_alignment_templates(beat, cfg)
if not templates:
@@ -830,7 +844,15 @@ def align_in_point_by_content(
best_score = -1.0
t = start_s
while t <= end_s:
score = _content_alignment_score(cap, t, templates, cfg)
if scene_end_s is not None:
avail_s = scene_end_s - t
if avail_s > 0:
active_templates = [(off, tpl) for off, tpl in templates if off <= avail_s]
else:
active_templates = []
else:
active_templates = templates
score = _content_alignment_score(cap, t, active_templates, cfg) if active_templates else -1.0
if score > best_score + tie_delta:
best_score = score
best_in = t
@@ -981,6 +1003,7 @@ def estimate_usable_source_duration(
cfg: AppConfig,
sample_step_s: float | None = None,
min_keep_s: float = 0.5,
scene_end_s: float | None = None,
) -> tuple[float, float]:
"""
Estimate how long the source stays visually aligned with the beat.
@@ -1032,6 +1055,17 @@ def estimate_usable_source_duration(
break
tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
if scene_end_s is not None:
avail_s = scene_end_s - in_point_s
if avail_s > 0 and last_good >= avail_s - (step_s * 2):
logger.info('Beat %d: Boundary hit: last_good=%.3f, avail_s=%.3f, step_s=%.3f. Disabling tail safety.', beat.beat_id, last_good, avail_s, step_s)
tail_safety_s = 0.0
last_good = max(last_good, avail_s)
else:
if avail_s > 0:
logger.info('Beat %d: Boundary NOT hit: last_good=%.3f, avail_s=%.3f, thresh=%.3f', beat.beat_id, last_good, avail_s, avail_s - (step_s * 2))
usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s))
if usable < min_keep_s and scores:
usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s))
@@ -1071,6 +1105,7 @@ def refine_in_point_with_sequence(
estimated_in_point_s: float,
cfg: AppConfig,
search_window_s: float | None = None,
scene_end_s: float | None = None,
) -> tuple[float, float]:
"""
Refine a rough source in-point by comparing several frames across the beat.
@@ -1078,7 +1113,7 @@ def refine_in_point_with_sequence(
Returns:
(best_in_point_s, sequence_score)
"""
return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s)
return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s, scene_end_s)
def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
@@ -1357,23 +1392,51 @@ def run_global_scan(
midpoint_t = coarse_in_s + (b.duration_s / 2)
fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg)
rough_in_s = max(0.0, fine_t - (b.duration_s / 2))
# Don't let midpoint refinement jump to a different scene — a
# scene boundary crossed here means a wrong-content frame won
# the template match. Revert so the coarse candidate retains
# its original scene context.
if scenes:
coarse_scene_check = _find_scene_for_time(scenes, coarse_in_s, cfg)
rough_scene_check = _find_scene_for_time(scenes, rough_in_s, cfg)
if (
coarse_scene_check is not None
and rough_scene_check is not None
and coarse_scene_check.scene_id != rough_scene_check.scene_id
):
rough_in_s = coarse_in_s
local_align_window_s = (
min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds)
if is_weighted_seed_candidate
else None
)
# When rough_in_s is near a scene boundary, filter templates to
# stay within the available source window so cross-boundary frames
# from unrelated content don't corrupt alignment and scoring.
rough_scene_for_boundary = _find_scene_for_time(scenes, rough_in_s, cfg) if scenes else None
rough_scene_end_s: float | None = None
if rough_scene_for_boundary is not None and matchable_duration_s > 0:
_avail = float(rough_scene_for_boundary.end_s) - rough_in_s
if 0 < _avail < matchable_duration_s:
rough_scene_end_s = float(rough_scene_for_boundary.end_s)
refined_in_s, sequence_score = refine_in_point_with_sequence(
b,
rough_in_s,
cfg,
search_window_s=local_align_window_s,
scene_end_s=rough_scene_end_s,
)
scene = _find_scene_for_time(scenes, refined_in_s, cfg)
scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps
adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg)
adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene)
scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg)
usable_duration_s, span_score = estimate_usable_source_duration(
b,
adjusted_in_s,
cfg,
scene_end_s=float(scene.end_s) if scene is not None else None
)
out_s = adjusted_in_s + usable_duration_s
if scene is not None:
out_s = min(out_s, scene.end_s)
@@ -1385,6 +1448,7 @@ def run_global_scan(
adjusted_in_s,
validation_templates,
cfg,
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
)
content_score = original_content_score
content_in_s, align_content_score = align_in_point_by_content(
@@ -1396,6 +1460,7 @@ def run_global_scan(
if local_align_window_s is not None
else min(0.8, cfg.cv.deep_scan.content_align_window_seconds)
),
scene_end_s=rough_scene_end_s,
)
if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds:
with open_video(cfg.paths.source_movie) as validation_cap:
@@ -1404,6 +1469,7 @@ def run_global_scan(
content_in_s,
validation_templates,
cfg,
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
)
if aligned_content_score >= original_content_score + 0.01:
adjusted_in_s = content_in_s
@@ -1459,6 +1525,7 @@ def run_global_scan(
adjusted_in_s,
validation_templates,
cfg,
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
)
else:
motion_score = original_motion_score