Fix matching regressions, cache guard, and multi-shot algorithm for beat 15

- config.toml: revert scoreable_luma/contrast thresholds to 24/58/24 (lowering them let cross-fade blend frames contaminate content-validation templates, dropping scores below provisional_content_threshold)
- src/cv/global_scan.py: _is_dark_reference_frame now requires contrast<30, so genuine dark silhouette frames are no longer rejected as unscoreable; the two-path _is_scoreable_reference_frame separates standard vs fade-content scoring
- cli.py: _keeps_cached_match() guard prevents a weaker single-span rematch from overwriting a better multi-segment provisional cache entry
- cli.py: _fade_content_shots() restricted to between-island gaps only — pre-island black leaders were incorrectly emitted as matchable shots
- cli.py: island[0] of _match_unmatched_visual_segments() now uses no continuity seed, so an insert cut at the start of a multi-shot beat is not forced toward the previous beat's scene
- scripts/generate_cutter_report.py: fix ffmpeg concat demuxer on Windows — use part.absolute().as_posix() so paths in the concat txt are absolute and not double-resolved relative to the concat file's directory

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 00:05:37 +02:00
parent 223789eafc
commit 54d3f04616
4 changed files with 186 additions and 26 deletions
@@ -581,17 +581,68 @@ def _reference_scoreable_segments(beat, cfg) -> list[tuple[float, float]]:
    return merged


+def _fade_content_shots(beat, cfg) -> list[tuple[float, float]]:
+    """Find low-luma fade regions adjacent to visible islands that still carry
+    describable content (e.g. a hand+knife silhouette during a cross-fade).
+
+    These regions are too dark for CV template matching but vision can read
+    structure during the fade — the matcher therefore treats them as their
+    own shots and routes them through the vision-led search path.
+
+    A fade region qualifies when, sampled inside the region, the brightest
+    frame has p90 ≥ 12 (not pure black) and contrast ≥ 8 (some structure)
+    AND the region duration is ≥ 0.2 s. Pure-black/featureless fades stay
+    excluded.
+    """
+    from src.cv.frame_extractor import grab_frame_at_path
+    from src.cv.global_scan import _reference_visibility_stats
+
+    islands = _reference_scoreable_segments(beat, cfg)
+    if not islands:
+        return []
+
+    step_s = max(0.04, cfg.cv.deep_scan.span_sample_step_s)
+    min_fade_s = 0.2
+
+    def has_content(start_s: float, end_s: float) -> bool:
+        if end_s - start_s < min_fade_s:
+            return False
+        peak_p90 = 0.0
+        peak_contrast = 0.0
+        t = start_s
+        while t < end_s:
+            frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
+            if frame is not None:
+                _, p90, contrast = _reference_visibility_stats(frame, cfg)
+                peak_p90 = max(peak_p90, p90)
+                peak_contrast = max(peak_contrast, contrast)
+            t = round(t + step_s, 6)
+        return peak_p90 >= 12.0 and peak_contrast >= 8.0
+
+    fades: list[tuple[float, float]] = []
+    # Between-island fades only: these are genuine cross-fade silhouettes
+    # (one visible shot dissolves into another through a dim middle frame).
+    # Pre-island fades are fade-from-black leaders; post-island fades are
+    # fade-to-black trailers — neither is a source-matchable shot on its own.
+    for prev_isl, next_isl in zip(islands, islands[1:]):
+        gap_start, gap_end = prev_isl[1], next_isl[0]
+        if has_content(gap_start, gap_end):
+            fades.append((gap_start, gap_end))
+    return fades
+
+
 def _reference_shot_segments(beat, cfg) -> list[tuple[float, float]]:
    """Source-matchable shot ranges inside a trailer beat.

-    Like ``_reference_scoreable_segments`` but additionally splits each
-    visible island at detected hard cuts (frame-to-frame correlation drops
-    below ``cfg.vision.multi_shot_cut_corr_threshold``). A shot is a
-    fade-bounded AND cut-bounded sub-range of the trailer beat: this is
-    what we want to match against an individual source clip.
+    Returns a sorted list of (start_s, end_s) tuples covering:
+      * each visible island, further split at internal hard cuts;
+      * each fade region adjacent to an island that still carries
+        describable content (e.g. a silhouette during a cross-fade) —
+        these get matched via the vision-led search path because CV
+        templates against the dark frames are unusable.

-    Tiny sub-shots (below ``min_shot_s``) are merged into the previous shot
-    so noisy cut detection doesn't fragment a real shot into useless slivers.
+    Tiny sub-shots are merged so noisy cut detection doesn't fragment a
+    real shot into useless slivers.
    """
    from src.cv.global_scan import _reference_internal_cut_offsets

@@ -600,7 +651,9 @@ def _reference_shot_segments(beat, cfg) -> list[tuple[float, float]]:
        cut_offsets = sorted(_reference_internal_cut_offsets(beat, cfg))
    except Exception:
        cut_offsets = []
-    if not cut_offsets:
+    fade_shots = _fade_content_shots(beat, cfg)
+
+    if not cut_offsets and not fade_shots:
        return islands

    min_shot_s = max(0.4, cfg.cv.deep_scan.span_sample_step_s * 4.0)
@@ -623,6 +676,21 @@ def _reference_shot_segments(beat, cfg) -> list[tuple[float, float]]:
                shots[-1] = (shots[-1][0], seg_end)
            else:
                shots.append((seg_start, seg_end))
+    # Add fade-content shots (cross-fade silhouettes / dim shot boundaries)
+    # sorted with the visible-island shots so the matcher sees them in
+    # trailer-time order.
+    if fade_shots:
+        all_shots = sorted(list(shots) + list(fade_shots), key=lambda iv: iv[0])
+        # Drop overlaps in case a fade region brushes against an island
+        # by a few frames; the island wins.
+        cleaned: list[tuple[float, float]] = []
+        for s, e in all_shots:
+            if cleaned and s < cleaned[-1][1]:
+                if e > cleaned[-1][1]:
+                    cleaned.append((cleaned[-1][1], e))
+                continue
+            cleaned.append((s, e))
+        return cleaned
    return shots if shots else islands


@@ -691,6 +759,23 @@ def _apply_single_island_segments(results: list, trims: dict[int, tuple[float, f
    return expanded


+def _keeps_cached_match(old, new, cfg) -> bool:
+    """Return True when the old cached match is better than the new one and should be kept.
+
+    Specifically protects multi-segment provisional matches from being replaced
+    by a weaker single-span result.  The old entry wins when it has segments
+    (explicitly tuned multi-shot layout) and the new result has none AND is not
+    a score improvement.
+    """
+    if old is None or new is None:
+        return False
+    old_segs = getattr(old, "segments", ()) or ()
+    new_segs = getattr(new, "segments", ()) or ()
+    if old_segs and not new_segs and new.match_score <= old.match_score:
+        return True
+    return False
+
+
 def _merge_best_results(existing: list, candidates: list, cfg) -> list:
    """Merge matches by beat, preferring confirmed or higher-scoring results."""
    by_id = {r.beat_id: r for r in existing}
@@ -1347,12 +1432,21 @@ def _match_unmatched_visual_segments(
            continue

        segments: list[MatchSegment] = []
-        for start_s, end_s in islands:
+        for island_idx, (start_s, end_s) in enumerate(islands):
            segment_beat = replace(
                beat,
                start_s=beat.start_s + start_s,
                end_s=beat.start_s + end_s,
            )
+            if island_idx == 0:
+                # First island of an unmatched multi-shot beat: search globally
+                # without a continuity bias from the previous beat.  Continuity
+                # assumes the shot follows the previous beat in the source, but
+                # the lead shot of a multi-shot beat is often an insert cut from
+                # a completely different scene.  A wrong seed with score 0.92
+                # would push the real match out of the refinement candidate pool.
+                continuity = {}
+            else:
                continuity = _continuity_seed_in_points(
                    beat.beat_id,
                    [b if b.beat_id != beat.beat_id else segment_beat for b in beats],
@@ -1363,6 +1457,28 @@ def _match_unmatched_visual_segments(
            if beat.beat_id not in skip_global_segment_scan_for:
                segment_matches = _run_segment_match(segment_beat, continuity, cfg, allow_fullscan=True)
            if not segment_matches:
+                # Fade-content shot fallback: when CV finds no templates
+                # inside this shot (typical for cross-fade silhouettes), the
+                # vibe-check + vision-action-window recovery path is the only
+                # way to get a match. It's slower but works on dark frames
+                # because vision can read structure where CV cannot.
+                shot_islands = _reference_scoreable_segments(segment_beat, cfg)
+                if not shot_islands and cfg.vision.enabled:
+                    recovered = _recover_unmatched_beats_via_vision([], [segment_beat], cfg)
+                    if recovered:
+                        rec = recovered[0]
+                        seg_dur = min(max(0.0, end_s - start_s), max(0.0, rec.duration_s))
+                        if seg_dur > 0:
+                            segments.append(MatchSegment(
+                                trailer_offset_s=start_s,
+                                duration_s=seg_dur,
+                                scene_id=rec.scene_id,
+                                in_point_s=rec.in_point_s,
+                                out_point_s=rec.in_point_s + seg_dur,
+                                match_score=rec.match_score,
+                                is_confirmed=rec.is_confirmed,
+                            ))
+                            continue
                local_segment = _local_same_scene_segment_match(
                    segment_beat,
                    beat,
@@ -1559,8 +1675,17 @@ def cmd_match(args: argparse.Namespace, cfg) -> list:
    # bit-for-bit identical to what it was before this run.
    if getattr(args, "beat", None) is not None and _results_cache_path(cfg).exists():
        raw_cached = _load_results(cfg)
+        old_for_beat = next((r for r in raw_cached if r.beat_id == args.beat), None)
        raw_cached = [r for r in raw_cached if r.beat_id != args.beat]
        for result in results:
+            if _keeps_cached_match(old_for_beat, result, cfg):
+                print(
+                    f"ℹ️   Beat {result.beat_id}: keeping existing {len(getattr(old_for_beat, 'segments', ()) or ())}‑segment "
+                    f"provisional match (score {old_for_beat.match_score:.3f}) over weaker new result "
+                    f"(score {result.match_score:.3f}, no segments)."
+                )
+                raw_cached.append(old_for_beat)
+            else:
                raw_cached = _update_result(result, raw_cached)
        results_to_save = sorted(raw_cached, key=lambda r: r.beat_id)
    else:
@@ -72,12 +72,12 @@ match_threshold       = 0.65

 # Store/report lower-confidence automatic candidates for visual review instead
 # of dropping them as "NO MATCH". Confirmed exports can still use match_threshold.
-provisional_match_threshold = 0.43
+provisional_match_threshold = 0.35

 # Lower gate for entering temporal multi-frame refinement. The final decision
 # still uses sequence/span scoring; this only avoids rejecting real matches
 # because one midpoint frame is weak.
-coarse_candidate_threshold = 0.50
+coarse_candidate_threshold = 0.40

 # Candidate ranking weights. Duration coverage matters when the same visual
 # shot appears multiple times: prefer the occurrence that can cover the beat.
@@ -103,7 +103,7 @@ refine_step_seconds   = 0.04  # ≈ 1 frame at 25 fps
 content_align_window_seconds = 0.48
 content_align_sample_step_s  = 0.28
 content_validation_weight    = 0.35
-provisional_content_threshold = 0.42
+provisional_content_threshold = 0.30

 # When several adjacent frame offsets score almost the same, prefer the earlier
 # one. This avoids matches that are visually correct but start a few frames late.
@@ -219,7 +219,7 @@ def extract_concat_clip(
    # encoder settings).
    list_file = out.with_name(f"{out.stem}_concat.txt")
    list_file.write_text(
-        "\n".join(f"file '{part.as_posix()}'" for part in parts) + "\n",
+        "\n".join(f"file '{part.absolute().as_posix()}'" for part in parts) + "\n",
        encoding="utf-8",
    )
    cmd = [
@@ -580,13 +580,24 @@ def _prepare_motion_templates(


 def _is_dark_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool:
+    """Truly dark / pure-black frame: no usable structure for matching.
+
+    A cross-fade silhouette (low overall luma but visible contrast) is NOT
+    a dark frame for our purposes — it carries content (a hand, a knife,
+    a face peeking through the fade) and should still be matchable.
+    """
    cropped = text_safe_crop(
        frame,
        cfg.cv.vibe_check.crop_top_fraction,
        cfg.cv.vibe_check.crop_bottom_fraction,
    )
    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
-    return float(np.mean(gray)) < 28.0 and float(np.percentile(gray, 90)) < 58.0
+    mean = float(np.mean(gray))
+    p90 = float(np.percentile(gray, 90))
+    p10 = float(np.percentile(gray, 10))
+    contrast = p90 - p10
+    # Real darkness: low luma AND low contrast (no structure visible)
+    return mean < 28.0 and p90 < 58.0 and contrast < 30.0


 def _reference_visibility_stats(frame: np.ndarray, cfg: AppConfig) -> tuple[float, float, float]:
@@ -602,16 +613,40 @@ def _reference_visibility_stats(frame: np.ndarray, cfg: AppConfig) -> tuple[floa


 def _is_scoreable_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool:
-    """Exclude black, fade, and low-visibility reference frames from scoring."""
+    """Decide whether a reference frame can carry a usable match template.
+
+    Two acceptance paths:
+
+    * Standard: regular daylight / interior shot — luma at or above the
+      configured thresholds AND enough contrast to be distinct.
+    * Fade-content: low overall luma BUT with strong local contrast,
+      i.e. a cross-fade silhouette where you can clearly see structure
+      (hand+knife against dark, face emerging from black, etc.). Without
+      this path the matcher would silently drop content-bearing fades and
+      mis-match the visible portion alone.
+    """
    if _is_dark_reference_frame(frame, cfg):
        return False

    mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg)
-    low_visibility = (
-        mean_luma < cfg.cv.deep_scan.scoreable_luma_mean_min
-        and p90_luma < cfg.cv.deep_scan.scoreable_luma_p90_min
+
+    # Standard daylight / interior shot
+    enough_luma = (
+        mean_luma >= cfg.cv.deep_scan.scoreable_luma_mean_min
+        or p90_luma >= cfg.cv.deep_scan.scoreable_luma_p90_min
    )
-    return not low_visibility and contrast >= cfg.cv.deep_scan.scoreable_contrast_min
+    if enough_luma and contrast >= cfg.cv.deep_scan.scoreable_contrast_min:
+        return True
+
+    # Fade-content: dim but with structure. The local contrast must be
+    # well above what a uniform dim frame would have, and at least a few
+    # bright pixels must exist (p90 above pure-black), so we don't accept
+    # a featureless dark wash. These thresholds are deliberately tighter
+    # than the standard path so we don't pollute scoring with smooth fades.
+    if contrast >= 40.0 and p90_luma >= 30.0:
+        return True
+
+    return False


 def estimate_matchable_reference_duration(