Per-shot match for beats with internal cuts; protect cache on --beat runs

Two issues fixed:

1. Beats with internal hard cuts (e.g. man-shot then back to woman) were
   being approximated by a single source clip because the multi-segment
   path only triggered for fade-bounded multi-island beats. Added
   _reference_shot_segments(), which returns the shot ranges by splitting
   each visible island at detected internal cuts. The multi-island gate in
   cmd_match and the per-island loop in _match_unmatched_visual_segments
   now use shots, so any beat with cuts > 0 produces one MatchSegment per
   shot. Each shot is matched independently against the source movie.

   Effect on Beat 10: 1 segment (3.32 s in scene 558) -> 3 segments
   covering shots 0-0.88 s, 0.88-2.64 s, 2.64-3.32 s in scenes 554, 559,
   556 respectively, with the previously missing "back to woman" cut now
   correctly placed in scene 556.

2. Targeted --beat N runs were silently dropping cache entries for other
   beats whose old scores no longer pass current quality gates
   (_normalize_cached_results runs at load time and removes them). The
   save path now re-loads the raw cache from disk and writes back every
   non-targeted beat verbatim, so a per-beat run can never regress
   another beat's stored match.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Melbar
2026-05-05 00:06:39 +02:00
parent 2a3840e528
commit cc27208d2a
3 changed files with 68 additions and 11 deletions
+64 -7
View File
@@ -562,6 +562,51 @@ def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
return merged
def _reference_shot_segments(beat, cfg) -> list[tuple[float, float]]:
    """Source-matchable shot ranges inside a trailer beat.

    Like ``_reference_scoreable_segments`` but additionally splits each
    visible island at the hard-cut positions reported by
    ``_reference_internal_cut_offsets`` (frame-to-frame correlation drops
    below ``cfg.vision.multi_shot_cut_corr_threshold``). A shot is a
    fade-bounded AND cut-bounded sub-range of the trailer beat: this is
    what we want to match against an individual source clip.

    Tiny sub-shots (below ``min_shot_s``) are merged into the previous shot
    when they directly follow it, so noisy cut detection doesn't fragment a
    real shot into useless slivers. A tiny sub-shot that does NOT touch the
    previous shot (i.e. it opens a new island after a fade gap) is kept as
    its own shot; it is never glued onto the previous island's last shot,
    because that would produce a "shot" spanning the gap between islands.

    Args:
        beat: Trailer beat to split; passed through unchanged to
            ``_reference_scoreable_segments`` and
            ``_reference_internal_cut_offsets``.
        cfg: Project configuration; ``cfg.cv.deep_scan.span_sample_step_s``
            is read here to derive the minimum shot length.

    Returns:
        ``(start, end)`` ranges in the same units as the islands returned by
        ``_reference_scoreable_segments``. When no cuts are detected (or cut
        detection fails) the islands are returned unchanged.
    """
    import logging

    from src.cv.global_scan import _reference_internal_cut_offsets

    islands = _reference_scoreable_segments(beat, cfg)
    try:
        cut_offsets = sorted(_reference_internal_cut_offsets(beat, cfg))
    except Exception:
        # Cut detection is best-effort: fall back to fade-bounded islands,
        # but leave a trace so a broken detector doesn't go unnoticed.
        logging.getLogger(__name__).debug(
            "Internal cut detection failed for beat %s; using fade-bounded islands",
            getattr(beat, "beat_id", None),
            exc_info=True,
        )
        cut_offsets = []
    if not cut_offsets:
        return islands

    # Tolerance used both to ignore cuts sitting on an island edge and to
    # decide whether a piece directly follows the previous shot.
    eps = 1e-3
    min_shot_s = max(0.4, cfg.cv.deep_scan.span_sample_step_s * 4.0)

    shots: list[tuple[float, float]] = []
    for start_s, end_s in islands:
        # Only cuts strictly inside the island split it.
        inner_cuts = [cut for cut in cut_offsets if start_s + eps < cut < end_s - eps]
        boundaries = [start_s, *inner_cuts, end_s]
        for seg_start, seg_end in zip(boundaries, boundaries[1:]):
            too_short = seg_end - seg_start < min_shot_s
            touches_previous = bool(shots) and shots[-1][1] >= seg_start - eps
            if too_short and touches_previous:
                # Absorb the sliver into the shot it directly follows.
                shots[-1] = (shots[-1][0], seg_end)
            else:
                # Long enough, or a short piece with nothing adjacent to
                # merge into (first piece overall / first piece after a gap).
                shots.append((seg_start, seg_end))
    return shots if shots else islands
def _trim_beats_to_single_visual_island(beats: list, cfg) -> tuple[list, dict[int, tuple[float, float]]]:
"""Use a single visible island as the primary match target for faded beats."""
from dataclasses import replace
@@ -1276,7 +1321,9 @@ def _match_unmatched_visual_segments(
if beat.beat_id in matched_ids:
continue
islands = _reference_scoreable_segments(beat, cfg)
# Per-shot matching when the beat has either fade-bounded islands
# OR internal hard cuts; each shot becomes its own MatchSegment.
islands = _reference_shot_segments(beat, cfg)
if not islands:
continue
@@ -1423,10 +1470,15 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
all_beats = _load_beats(cfg)
beats = _select_beats(all_beats, getattr(args, "beat", None))
cached = _normalize_cached_results(all_beats, _load_results(cfg), cfg) if _results_cache_path(cfg).exists() else []
# Multi-shot beats: either fade-bounded multiple islands, OR a single
# island with internal hard cuts (e.g. man-shot then back to woman). Both
# cases are routed through the per-segment match path so each shot gets
# its own source clip instead of being approximated by one continuous
# span.
multi_island_beat_ids = {
beat.beat_id
for beat in beats
if len(_reference_scoreable_segments(beat, cfg)) > 1
if len(_reference_shot_segments(beat, cfg)) > 1
}
scan_beats, single_island_trims = _trim_beats_to_single_visual_island(beats, cfg)
scan_beats = [b for b in scan_beats if b.beat_id not in multi_island_beat_ids]
@@ -1480,13 +1532,18 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
results = _filter_semantically_invalid_vision_matches(results, beats, cfg)
results = _recover_unmatched_beats_via_vision(results, beats, cfg)
# A targeted one-beat match should improve the cache without deleting
# automatic matches for other beats.
# A targeted one-beat match must NEVER delete or modify any other beat's
# cache entry. We deliberately re-load the raw cache from disk here so
# the upstream normalisation pass (which drops entries that no longer
# pass current quality gates) cannot leak into the save: only the
# targeted beat's slot gets replaced, every other entry is written back
# bit-for-bit identical to what it was before this run.
if getattr(args, "beat", None) is not None and _results_cache_path(cfg).exists():
cached = [r for r in cached if r.beat_id != args.beat]
raw_cached = _load_results(cfg)
raw_cached = [r for r in raw_cached if r.beat_id != args.beat]
for result in results:
cached = _update_result(result, cached)
results_to_save = cached
raw_cached = _update_result(result, raw_cached)
results_to_save = sorted(raw_cached, key=lambda r: r.beat_id)
else:
results_to_save = results