Fix: prevent tail-trimming of valid matches at hard scene boundaries in global_scan.py
This commit is contained in:
+71
-4
@@ -260,12 +260,21 @@ def _fixed_content_sequence_score(
|
|||||||
in_point_s: float,
|
in_point_s: float,
|
||||||
templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]],
|
templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]],
|
||||||
cfg: AppConfig,
|
cfg: AppConfig,
|
||||||
|
max_source_duration_s: float | None = None,
|
||||||
) -> float:
|
) -> float:
|
||||||
if not templates:
|
if not templates:
|
||||||
return 0.0
|
return 0.0
|
||||||
|
|
||||||
|
active = (
|
||||||
|
[(off, feats) for off, feats in templates if off <= max_source_duration_s]
|
||||||
|
if max_source_duration_s is not None and max_source_duration_s > 0
|
||||||
|
else templates
|
||||||
|
)
|
||||||
|
if not active:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
scores: list[float] = []
|
scores: list[float] = []
|
||||||
for offset_s, ref_features in templates:
|
for offset_s, ref_features in active:
|
||||||
frame = grab_frame_at(cap, in_point_s + offset_s)
|
frame = grab_frame_at(cap, in_point_s + offset_s)
|
||||||
if frame is None:
|
if frame is None:
|
||||||
return 0.0
|
return 0.0
|
||||||
@@ -802,6 +811,7 @@ def align_in_point_by_content(
|
|||||||
estimated_in_point_s: float,
|
estimated_in_point_s: float,
|
||||||
cfg: AppConfig,
|
cfg: AppConfig,
|
||||||
search_window_s: float | None = None,
|
search_window_s: float | None = None,
|
||||||
|
scene_end_s: float | None = None,
|
||||||
) -> tuple[float, float]:
|
) -> tuple[float, float]:
|
||||||
"""
|
"""
|
||||||
Find the frame offset directly from image content around a rough match.
|
Find the frame offset directly from image content around a rough match.
|
||||||
@@ -809,6 +819,10 @@ def align_in_point_by_content(
|
|||||||
This is deliberately local: once a candidate shot is plausible, scanning a
|
This is deliberately local: once a candidate shot is plausible, scanning a
|
||||||
small window around it with many reference frames is faster and more robust
|
small window around it with many reference frames is faster and more robust
|
||||||
than repeating a global scan or applying a fixed frame preroll.
|
than repeating a global scan or applying a fixed frame preroll.
|
||||||
|
|
||||||
|
scene_end_s: when provided, templates are dynamically filtered per candidate
|
||||||
|
to only offsets that stay within the source scene, preventing cross-boundary
|
||||||
|
frame reads from dragging scores into wrong-content territory.
|
||||||
"""
|
"""
|
||||||
templates = _content_alignment_templates(beat, cfg)
|
templates = _content_alignment_templates(beat, cfg)
|
||||||
if not templates:
|
if not templates:
|
||||||
@@ -830,7 +844,15 @@ def align_in_point_by_content(
|
|||||||
best_score = -1.0
|
best_score = -1.0
|
||||||
t = start_s
|
t = start_s
|
||||||
while t <= end_s:
|
while t <= end_s:
|
||||||
score = _content_alignment_score(cap, t, templates, cfg)
|
if scene_end_s is not None:
|
||||||
|
avail_s = scene_end_s - t
|
||||||
|
if avail_s > 0:
|
||||||
|
active_templates = [(off, tpl) for off, tpl in templates if off <= avail_s]
|
||||||
|
else:
|
||||||
|
active_templates = []
|
||||||
|
else:
|
||||||
|
active_templates = templates
|
||||||
|
score = _content_alignment_score(cap, t, active_templates, cfg) if active_templates else -1.0
|
||||||
if score > best_score + tie_delta:
|
if score > best_score + tie_delta:
|
||||||
best_score = score
|
best_score = score
|
||||||
best_in = t
|
best_in = t
|
||||||
@@ -981,6 +1003,7 @@ def estimate_usable_source_duration(
|
|||||||
cfg: AppConfig,
|
cfg: AppConfig,
|
||||||
sample_step_s: float | None = None,
|
sample_step_s: float | None = None,
|
||||||
min_keep_s: float = 0.5,
|
min_keep_s: float = 0.5,
|
||||||
|
scene_end_s: float | None = None,
|
||||||
) -> tuple[float, float]:
|
) -> tuple[float, float]:
|
||||||
"""
|
"""
|
||||||
Estimate how long the source stays visually aligned with the beat.
|
Estimate how long the source stays visually aligned with the beat.
|
||||||
@@ -1032,6 +1055,17 @@ def estimate_usable_source_duration(
|
|||||||
break
|
break
|
||||||
|
|
||||||
tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
|
tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
|
||||||
|
|
||||||
|
if scene_end_s is not None:
|
||||||
|
avail_s = scene_end_s - in_point_s
|
||||||
|
if avail_s > 0 and last_good >= avail_s - (step_s * 2):
|
||||||
|
logger.info('Beat %d: Boundary hit: last_good=%.3f, avail_s=%.3f, step_s=%.3f. Disabling tail safety.', beat.beat_id, last_good, avail_s, step_s)
|
||||||
|
tail_safety_s = 0.0
|
||||||
|
last_good = max(last_good, avail_s)
|
||||||
|
else:
|
||||||
|
if avail_s > 0:
|
||||||
|
logger.info('Beat %d: Boundary NOT hit: last_good=%.3f, avail_s=%.3f, thresh=%.3f', beat.beat_id, last_good, avail_s, avail_s - (step_s * 2))
|
||||||
|
|
||||||
usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s))
|
usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s))
|
||||||
if usable < min_keep_s and scores:
|
if usable < min_keep_s and scores:
|
||||||
usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s))
|
usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s))
|
||||||
@@ -1071,6 +1105,7 @@ def refine_in_point_with_sequence(
|
|||||||
estimated_in_point_s: float,
|
estimated_in_point_s: float,
|
||||||
cfg: AppConfig,
|
cfg: AppConfig,
|
||||||
search_window_s: float | None = None,
|
search_window_s: float | None = None,
|
||||||
|
scene_end_s: float | None = None,
|
||||||
) -> tuple[float, float]:
|
) -> tuple[float, float]:
|
||||||
"""
|
"""
|
||||||
Refine a rough source in-point by comparing several frames across the beat.
|
Refine a rough source in-point by comparing several frames across the beat.
|
||||||
@@ -1078,7 +1113,7 @@ def refine_in_point_with_sequence(
|
|||||||
Returns:
|
Returns:
|
||||||
(best_in_point_s, sequence_score)
|
(best_in_point_s, sequence_score)
|
||||||
"""
|
"""
|
||||||
return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s)
|
return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s, scene_end_s)
|
||||||
|
|
||||||
|
|
||||||
def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
|
def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
|
||||||
@@ -1357,23 +1392,51 @@ def run_global_scan(
|
|||||||
midpoint_t = coarse_in_s + (b.duration_s / 2)
|
midpoint_t = coarse_in_s + (b.duration_s / 2)
|
||||||
fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg)
|
fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg)
|
||||||
rough_in_s = max(0.0, fine_t - (b.duration_s / 2))
|
rough_in_s = max(0.0, fine_t - (b.duration_s / 2))
|
||||||
|
# Don't let midpoint refinement jump to a different scene — a
|
||||||
|
# scene boundary crossed here means a wrong-content frame won
|
||||||
|
# the template match. Revert so the coarse candidate retains
|
||||||
|
# its original scene context.
|
||||||
|
if scenes:
|
||||||
|
coarse_scene_check = _find_scene_for_time(scenes, coarse_in_s, cfg)
|
||||||
|
rough_scene_check = _find_scene_for_time(scenes, rough_in_s, cfg)
|
||||||
|
if (
|
||||||
|
coarse_scene_check is not None
|
||||||
|
and rough_scene_check is not None
|
||||||
|
and coarse_scene_check.scene_id != rough_scene_check.scene_id
|
||||||
|
):
|
||||||
|
rough_in_s = coarse_in_s
|
||||||
local_align_window_s = (
|
local_align_window_s = (
|
||||||
min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds)
|
min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds)
|
||||||
if is_weighted_seed_candidate
|
if is_weighted_seed_candidate
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
# When rough_in_s is near a scene boundary, filter templates to
|
||||||
|
# stay within the available source window so cross-boundary frames
|
||||||
|
# from unrelated content don't corrupt alignment and scoring.
|
||||||
|
rough_scene_for_boundary = _find_scene_for_time(scenes, rough_in_s, cfg) if scenes else None
|
||||||
|
rough_scene_end_s: float | None = None
|
||||||
|
if rough_scene_for_boundary is not None and matchable_duration_s > 0:
|
||||||
|
_avail = float(rough_scene_for_boundary.end_s) - rough_in_s
|
||||||
|
if 0 < _avail < matchable_duration_s:
|
||||||
|
rough_scene_end_s = float(rough_scene_for_boundary.end_s)
|
||||||
refined_in_s, sequence_score = refine_in_point_with_sequence(
|
refined_in_s, sequence_score = refine_in_point_with_sequence(
|
||||||
b,
|
b,
|
||||||
rough_in_s,
|
rough_in_s,
|
||||||
cfg,
|
cfg,
|
||||||
search_window_s=local_align_window_s,
|
search_window_s=local_align_window_s,
|
||||||
|
scene_end_s=rough_scene_end_s,
|
||||||
)
|
)
|
||||||
scene = _find_scene_for_time(scenes, refined_in_s, cfg)
|
scene = _find_scene_for_time(scenes, refined_in_s, cfg)
|
||||||
scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps
|
scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps
|
||||||
adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg)
|
adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg)
|
||||||
adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene)
|
adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene)
|
||||||
scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
|
scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
|
||||||
usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg)
|
usable_duration_s, span_score = estimate_usable_source_duration(
|
||||||
|
b,
|
||||||
|
adjusted_in_s,
|
||||||
|
cfg,
|
||||||
|
scene_end_s=float(scene.end_s) if scene is not None else None
|
||||||
|
)
|
||||||
out_s = adjusted_in_s + usable_duration_s
|
out_s = adjusted_in_s + usable_duration_s
|
||||||
if scene is not None:
|
if scene is not None:
|
||||||
out_s = min(out_s, scene.end_s)
|
out_s = min(out_s, scene.end_s)
|
||||||
@@ -1385,6 +1448,7 @@ def run_global_scan(
|
|||||||
adjusted_in_s,
|
adjusted_in_s,
|
||||||
validation_templates,
|
validation_templates,
|
||||||
cfg,
|
cfg,
|
||||||
|
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
|
||||||
)
|
)
|
||||||
content_score = original_content_score
|
content_score = original_content_score
|
||||||
content_in_s, align_content_score = align_in_point_by_content(
|
content_in_s, align_content_score = align_in_point_by_content(
|
||||||
@@ -1396,6 +1460,7 @@ def run_global_scan(
|
|||||||
if local_align_window_s is not None
|
if local_align_window_s is not None
|
||||||
else min(0.8, cfg.cv.deep_scan.content_align_window_seconds)
|
else min(0.8, cfg.cv.deep_scan.content_align_window_seconds)
|
||||||
),
|
),
|
||||||
|
scene_end_s=rough_scene_end_s,
|
||||||
)
|
)
|
||||||
if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds:
|
if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds:
|
||||||
with open_video(cfg.paths.source_movie) as validation_cap:
|
with open_video(cfg.paths.source_movie) as validation_cap:
|
||||||
@@ -1404,6 +1469,7 @@ def run_global_scan(
|
|||||||
content_in_s,
|
content_in_s,
|
||||||
validation_templates,
|
validation_templates,
|
||||||
cfg,
|
cfg,
|
||||||
|
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
|
||||||
)
|
)
|
||||||
if aligned_content_score >= original_content_score + 0.01:
|
if aligned_content_score >= original_content_score + 0.01:
|
||||||
adjusted_in_s = content_in_s
|
adjusted_in_s = content_in_s
|
||||||
@@ -1459,6 +1525,7 @@ def run_global_scan(
|
|||||||
adjusted_in_s,
|
adjusted_in_s,
|
||||||
validation_templates,
|
validation_templates,
|
||||||
cfg,
|
cfg,
|
||||||
|
max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
motion_score = original_motion_score
|
motion_score = original_motion_score
|
||||||
|
|||||||
Reference in New Issue
Block a user