Add hi-res phase refinement for intra-scene phase matching (Beat 03 investigation)

2026-05-08 10:52:11 +02:00
parent 18c8c89ee6
commit f20f89b06b
1 changed files with 355 additions and 16 deletions
@@ -198,6 +198,158 @@ def _fixed_content_features(frame: np.ndarray, cfg: AppConfig) -> tuple[np.ndarr
    )


+def _hires_phase_feature(frame: np.ndarray) -> np.ndarray:
+    """Build the high-resolution luma feature used for intra-scene phase matching.
+
+    The frame is passed through ``_trim_dark_borders``, the top and bottom 5%
+    of the remaining rows are cropped away, and the result is converted to
+    grayscale, histogram-equalised and resized to 320x160 (width x height).
+
+    Standard pipeline features (160x80) lose the subtle pixel differences
+    between talking-head phases (mouth open vs. closed); this larger feature
+    keeps more spatial detail.  The 8x8 grid histogram is not computed here,
+    it is derived from this feature by ``_hires_spatial_hist``.
+
+    Args:
+        frame: Frame to describe.  It is converted with
+            ``cv2.COLOR_BGR2GRAY``, so a BGR image is expected.
+
+    Returns:
+        The equalised grayscale image resized to 320x160.
+    """
+    trimmed = _trim_dark_borders(frame)
+    height = trimmed.shape[0]
+    # Drop the outer 5% of rows at the top and bottom; columns are kept in full.
+    cropped = trimmed[int(height * 0.05):int(height * 0.95), :]
+    gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY)
+    gray = cv2.equalizeHist(gray)
+    return cv2.resize(gray, (320, 160), interpolation=cv2.INTER_AREA)
+
+
+def _hires_spatial_hist(frame_feature: np.ndarray) -> np.ndarray:
+    """Concatenate per-cell luma histograms taken over an 8x8 grid.
+
+    The feature is split into 8x8 equally sized cells (integer division, so
+    any remainder rows/columns at the bottom/right edge are not covered).
+    Each cell contributes a 24-bin histogram over the value range [0, 256),
+    normalised by its own sum; a small epsilon keeps the division defined
+    when a histogram sums to zero.  Cells are emitted row by row.
+
+    Args:
+        frame_feature: Single-channel feature image, e.g. the output of
+            ``_hires_phase_feature``.
+
+    Returns:
+        A float32 vector of 8 * 8 * 24 values.
+    """
+    cells_per_side = 8
+    rows, cols = frame_feature.shape[:2]
+    step_y = rows // cells_per_side
+    step_x = cols // cells_per_side
+
+    def _cell_hist(row_idx: int, col_idx: int) -> np.ndarray:
+        # Normalised 24-bin histogram of one grid cell.
+        top = row_idx * step_y
+        left = col_idx * step_x
+        patch = frame_feature[top:top + step_y, left:left + step_x]
+        counts = cv2.calcHist([patch], [0], None, [24], [0, 256])
+        counts = counts.astype(np.float32).flatten()
+        return counts / (float(np.sum(counts)) + 1e-6)
+
+    return np.concatenate([
+        _cell_hist(row_idx, col_idx)
+        for row_idx in range(cells_per_side)
+        for col_idx in range(cells_per_side)
+    ])
+
+
+def _hires_phase_score(
+    ref_feature: np.ndarray,
+    ref_spatial: np.ndarray,
+    src_frame: np.ndarray,
+) -> float:
+    """Score how well a source frame matches a hi-res reference template.
+
+    The source frame is first converted with ``_hires_phase_feature``; the
+    result blends three signals:
+
+    * whole-feature normalised cross-correlation (weight 0.25),
+    * normalised cross-correlation of the central window spanning 40% of the
+      height and width, intended to cover the face/expression region of a
+      talking head (weight 0.45),
+    * ``_hist_intersection`` of the 8x8 grid histograms (weight 0.30).
+
+    Args:
+        ref_feature: Reference feature from ``_hires_phase_feature``.
+        ref_spatial: Reference grid histogram from ``_hires_spatial_hist``.
+        src_frame: Raw source frame to compare against the reference.
+
+    Returns:
+        The weighted sum of the three signals.
+    """
+    candidate = _hires_phase_feature(src_frame)
+
+    def _ncc(image: np.ndarray, template: np.ndarray) -> float:
+        # Only the top-left entry of the correlation map is read.
+        response = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
+        return float(response[0][0])
+
+    whole_ncc = _ncc(candidate, ref_feature)
+
+    # Central window: 20% of each dimension on either side of the midpoint.
+    rows, cols = ref_feature.shape[:2]
+    mid_y, mid_x = rows // 2, cols // 2
+    half_h, half_w = int(rows * 0.20), int(cols * 0.20)
+    window = (
+        slice(mid_y - half_h, mid_y + half_h),
+        slice(mid_x - half_w, mid_x + half_w),
+    )
+    center_ncc = _ncc(candidate[window], ref_feature[window])
+
+    layout = _hist_intersection(ref_spatial, _hires_spatial_hist(candidate))
+    return whole_ncc * 0.25 + center_ncc * 0.45 + layout * 0.30
+
+
+def _hires_phase_refine(
+    beat: TrailerBeat,
+    in_point_s: float,
+    scene_start_s: float,
+    scene_end_s: float,
+    cfg: AppConfig,
+) -> float:
+    """Re-scan the full source scene at high resolution to correct phase.
+
+    This is applied as a final refinement step after the standard pipeline
+    has identified the correct scene. It addresses the case where low-res
+    features cannot distinguish between different phases of the same shot
+    (e.g. mouth open vs. closed in a talking-head close-up).
+
+    Args:
+        beat: Trailer beat whose reference frames are read from
+            ``beat.trailer_path`` starting at ``beat.start_s``.
+        in_point_s: In-point chosen by the standard pipeline; returned
+            unchanged when no template or no scoreable candidate exists.
+        scene_start_s: First candidate in-point to evaluate.
+        scene_end_s: Upper bound; a candidate is only evaluated while its
+            last template offset still falls at or before this time.
+        cfg: Application configuration (frame rate, source movie path).
+
+    Returns:
+        The candidate in-point with the highest combined hi-res score, or
+        ``in_point_s`` if nothing could be scored.
+    """
+    # One frame step, never finer than 40 ms, shared by reference sampling
+    # and the source scan.  The ``or 24.0`` fallback keeps an unset/zero
+    # frame rate from raising while the templates are being built.
+    frame_step_s = max(1.0 / (cfg.export.edl_frame_rate or 24.0), 0.04)
+
+    # Build hi-res templates from only the stable, bright reference frames
+    # before any fade begins.  Fading frames have dropping brightness that
+    # would penalise correct source positions where those offsets map to
+    # bright content in the source.
+    matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=0.04)
+    ref_templates: list[tuple[float, np.ndarray, np.ndarray, float]] = []
+    t = 0.0
+    while t <= matchable_s:
+        frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t)
+        if frame is not None and _is_scoreable_reference_frame(frame, cfg):
+            mean_l, _, contrast = _reference_visibility_stats(frame, cfg)
+            # Only use clearly visible frames (skip dimming fade frames)
+            if mean_l >= 50.0 and contrast >= 40.0:
+                feat = _hires_phase_feature(frame)
+                ref_templates.append((t, feat, _hires_spatial_hist(feat), mean_l))
+        t = round(t + frame_step_s, 6)
+
+    if not ref_templates:
+        return in_point_s
+
+    # For very short matchable durations (fast fades / cross-dissolves),
+    # keep only the brightest template.  When the beat fades quickly the
+    # later templates are dim and penalise every bright source candidate
+    # equally, destroying phase discrimination.  A single bright anchor
+    # gives maximum selectivity.
+    if matchable_s < 1.0 and len(ref_templates) > 1:
+        # max() returns the earliest template on a luma tie.
+        ref_templates = [max(ref_templates, key=lambda tpl: tpl[3])]
+        logger.debug(
+            'Beat %d: hi-res using single brightest template at offset %.3fs (luma %.1f)',
+            beat.beat_id, ref_templates[0][0], ref_templates[0][3],
+        )
+
+    # Strip the luma field for the scan loop
+    scan_templates = [(off, feat, sp) for off, feat, sp, _ in ref_templates]
+    max_ref_offset = max(off for off, _, _ in scan_templates)
+
+    # Scan the full scene
+    best_t = in_point_s
+    best_score = -1.0
+
+    # NOTE(review): every candidate re-reads and re-features one source frame
+    # per template, so the same source frames appear to be processed many
+    # times; caching per-frame features may be worth measuring.
+    with open_video(cfg.paths.source_movie) as cap:
+        t = scene_start_s
+        while t + max_ref_offset <= scene_end_s:
+            scores: list[float] = []
+            all_ok = True
+            for off, ref_feat, ref_spatial in scan_templates:
+                src_frame = grab_frame_at(cap, t + off)
+                if src_frame is None:
+                    # A missing frame invalidates this candidate entirely.
+                    all_ok = False
+                    break
+                scores.append(_hires_phase_score(ref_feat, ref_spatial, src_frame))
+            if all_ok and scores:
+                # Blend the mean with the worst template so that a single
+                # badly matching offset still lowers the candidate.
+                avg = sum(scores) / len(scores)
+                combined = avg * 0.7 + min(scores) * 0.3
+                if combined > best_score:
+                    best_score = combined
+                    best_t = t
+            t = round(t + frame_step_s, 6)
+
+    if best_t != in_point_s:
+        logger.info(
+            'Beat %d: hi-res phase refine moved in-point %.3fs -> %.3fs '
+            '(delta=%.3fs, score=%.4f)',
+            beat.beat_id, in_point_s, best_t,
+            best_t - in_point_s, best_score,
+        )
+    return best_t
+
+
 def _fixed_content_pair_score(
    ref_features: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray],
    source_frame: np.ndarray,
@@ -388,12 +540,36 @@ def _rerank_candidates_by_content(
    reranked: list[tuple[float, float, float]] = []
    with open_video(cfg.paths.source_movie) as cap:
        for coarse_score, t_sec in candidates:
-            content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg)
+            # If the candidate lands just before a scene boundary, also evaluate
+            # the start of the next scene.  A coarse-scan offset can place the
+            # in-point a few frames into the preceding (wrong) scene, causing
+            # the content and coverage scores to be artificially low even though
+            # the next scene is the correct visual match.
+            eval_t = t_sec
+            if scenes is not None:
+                cur_scene = _find_scene_for_time(scenes, t_sec, cfg)
+                if cur_scene is not None:
+                    remaining = float(cur_scene.end_s) - t_sec
+                    next_idx = next(
+                        (i + 1 for i, s in enumerate(scenes) if s.scene_id == cur_scene.scene_id),
+                        None,
+                    )
+                    if (
+                        remaining < cfg.cv.deep_scan.scene_boundary_epsilon_s * 4
+                        and next_idx is not None
+                        and next_idx < len(scenes)
+                    ):
+                        next_scene_start = float(scenes[next_idx].start_s)
+                        alt_content = _fixed_content_sequence_score(cap, next_scene_start, templates, cfg)
+                        cur_content = _fixed_content_sequence_score(cap, t_sec, templates, cfg)
+                        if alt_content > cur_content:
+                            eval_t = next_scene_start
+            content_score = _fixed_content_sequence_score(cap, eval_t, templates, cfg)
            coverage_score = 1.0
            if scenes is not None and matchable_duration_s and matchable_duration_s > 0:
                usable_s = _contiguous_scene_coverage_duration(
                    beat,
-                    t_sec,
+                    eval_t,
                    scenes,
                    matchable_duration_s,
                    cfg,
@@ -404,7 +580,7 @@ def _rerank_candidates_by_content(
                + coarse_score * 0.18
                + coverage_score * 0.20
            )
-            reranked.append((rank_score, coarse_score, t_sec))
+            reranked.append((rank_score, coarse_score, eval_t))

    return sorted(reranked, key=lambda item: item[0], reverse=True)

@@ -772,6 +948,8 @@ def _content_alignment_score(
    in_point_s: float,
    templates: list[tuple[float, np.ndarray]],
    cfg: AppConfig,
+    fps: float | None = None,
+    frame_cache: dict[int, np.ndarray] | None = None,
 ) -> float:
    if not templates:
        return -1.0
@@ -782,7 +960,13 @@ def _content_alignment_score(
    early_scores: list[float] = []

    for offset_s, template in templates:
-        frame = grab_frame_at(cap, in_point_s + offset_s)
+        t0 = in_point_s + offset_s
+        if frame_cache is not None and fps is not None:
+            idx = int(round(t0 * fps))
+            frame = frame_cache.get(idx)
+        else:
+            frame = grab_frame_at(cap, t0)
+            
        if frame is None:
            return -1.0

@@ -840,6 +1024,20 @@ def align_in_point_by_content(
        end_s = estimated_in_point_s + window_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

+        min_offset = min(off for off, _ in templates)
+        max_offset = max(off for off, _ in templates)
+        req_start_s = max(0.0, start_s + min_offset - frame_step_s)
+        req_end_s = end_s + max_offset + frame_step_s
+        
+        frame_cache = {}
+        t_req = req_start_s
+        while t_req <= req_end_s:
+            idx = int(round(t_req * fps))
+            frame = grab_frame_at(cap, t_req)
+            if frame is not None:
+                frame_cache[idx] = frame
+            t_req = round(t_req + frame_step_s, 6)
+
        best_in = estimated_in_point_s
        best_score = -1.0
        t = start_s
@@ -852,7 +1050,7 @@ def align_in_point_by_content(
                    active_templates = []
            else:
                active_templates = templates
-            score = _content_alignment_score(cap, t, active_templates, cfg) if active_templates else -1.0
+            score = _content_alignment_score(cap, t, active_templates, cfg, fps=fps, frame_cache=frame_cache) if active_templates else -1.0
            if score > best_score + tie_delta:
                best_score = score
                best_in = t
@@ -868,11 +1066,23 @@ def _motion_phase_score(
    in_point_s: float,
    motion_templates: list[tuple[float, float, np.ndarray, tuple[int, ...]]],
    cfg: AppConfig,
+    fps: float | None = None,
+    frame_cache: dict[int, np.ndarray] | None = None,
 ) -> float:
    scores: list[float] = []
    for offset_s, step_s, ref_delta, template_shape in motion_templates:
-        f0 = grab_frame_at(cap, in_point_s + offset_s)
-        f1 = grab_frame_at(cap, in_point_s + offset_s + step_s)
+        t0 = in_point_s + offset_s
+        t1 = in_point_s + offset_s + step_s
+        
+        if frame_cache is not None and fps is not None:
+            idx0 = int(round(t0 * fps))
+            idx1 = int(round(t1 * fps))
+            f0 = frame_cache.get(idx0)
+            f1 = frame_cache.get(idx1)
+        else:
+            f0 = grab_frame_at(cap, t0)
+            f1 = grab_frame_at(cap, t1)
+            
        if f0 is None or f1 is None:
            return -1.0
        src0 = _fixed_feature(f0, template_shape, cfg)
@@ -913,11 +1123,25 @@ def align_in_point_by_motion(
        end_s = estimated_in_point_s + window_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

+        min_offset = min(off for off, _, _, _ in motion_templates)
+        max_offset = max(off + step for off, step, _, _ in motion_templates)
+        req_start_s = max(0.0, start_s + min_offset - frame_step_s)
+        req_end_s = end_s + max_offset + frame_step_s
+        
+        frame_cache = {}
+        t_req = req_start_s
+        while t_req <= req_end_s:
+            idx = int(round(t_req * fps))
+            frame = grab_frame_at(cap, t_req)
+            if frame is not None:
+                frame_cache[idx] = frame
+            t_req = round(t_req + frame_step_s, 6)
+
        best_in = estimated_in_point_s
        best_score = -1.0
        t = start_s
        while t <= end_s:
-            score = _motion_phase_score(cap, t, motion_templates, cfg)
+            score = _motion_phase_score(cap, t, motion_templates, cfg, fps=fps, frame_cache=frame_cache)
            if score > best_score + tie_delta:
                best_score = score
                best_in = t
@@ -933,6 +1157,7 @@ def align_in_point_by_content_and_motion(
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
+    scene_end_s: float | None = None,
 ) -> tuple[float, float, float, float]:
    """
    Align a candidate using still-frame content and motion phase together.
@@ -959,23 +1184,57 @@ def align_in_point_by_content_and_motion(
        end_s = estimated_in_point_s + window_s
        tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta

+        min_t_offset = min(off for off, _ in templates) if templates else 0.0
+        max_t_offset = max(off for off, _ in templates) if templates else 0.0
+        min_m_offset = min(off for off, _, _, _ in motion_templates) if motion_templates else 0.0
+        max_m_offset = max(off + step for off, step, _, _ in motion_templates) if motion_templates else 0.0
+        
+        min_offset = min(min_t_offset, min_m_offset)
+        max_offset = max(max_t_offset, max_m_offset)
+        req_start_s = max(0.0, start_s + min_offset - frame_step_s)
+        req_end_s = end_s + max_offset + frame_step_s
+        
+        frame_cache = {}
+        t_req = req_start_s
+        while t_req <= req_end_s:
+            idx = int(round(t_req * fps))
+            frame = grab_frame_at(cap, t_req)
+            if frame is not None:
+                frame_cache[idx] = frame
+            t_req = round(t_req + frame_step_s, 6)
+
        best_in = estimated_in_point_s
        best_score = -1.0
        best_content = -1.0
        best_motion = -1.0
        t = start_s
        while t <= end_s:
-            content_score = _content_alignment_score(cap, t, templates, cfg)
+            if scene_end_s is not None:
+                avail_s = scene_end_s - t
+                if avail_s > 0:
+                    active_templates = [(off, tpl) for off, tpl in templates if off <= avail_s]
+                    active_motion = [(off, step, delta, shape) for off, step, delta, shape in motion_templates if off + step <= avail_s]
+                else:
+                    active_templates = []
+                    active_motion = []
+            else:
+                active_templates = templates
+                active_motion = motion_templates
+
+            content_score = _content_alignment_score(cap, t, active_templates, cfg, fps=fps, frame_cache=frame_cache) if active_templates else -1.0
            motion_score = (
-                _motion_phase_score(cap, t, motion_templates, cfg)
-                if len(motion_templates) >= 2
+                _motion_phase_score(cap, t, active_motion, cfg, fps=fps, frame_cache=frame_cache)
+                if len(active_motion) >= 2
                else content_score
            )
            if content_score < 0 or motion_score < 0:
                t = round(t + frame_step_s, 6)
                continue
            raw_score = content_score * 0.64 + motion_score * 0.36
-            anchor_penalty = min(0.18, abs(t - estimated_in_point_s) * 0.05)
+            # The previous anchor_penalty of 0.05 per second was stronger than the
+            # actual variance in raw_score, preventing phase correction. We reduce it
+            # so that it only acts as a tie-breaker.
+            anchor_penalty = min(0.18, abs(t - estimated_in_point_s) * 0.005)
            score = raw_score - anchor_penalty
            if score > best_score + tie_delta:
                best_score = score
@@ -1027,6 +1286,18 @@ def estimate_usable_source_duration(
            frame = grab_frame_at(cap, in_point_s + offset_s)
            if frame is None:
                break
+            
+            # If the template is scoreable (has content) but the source frame is dark,
+            # this is a bad match. We should not let dark source frames
+            # provide high correlation to dark templates.
+            # templates are already pre-processed into feature images (grayscale/edges),
+            # so we can't use _is_scoreable_reference_frame on them directly.
+            # Instead, we rely on the fact that _prepare_beat_templates already
+            # filtered out non-scoreable frames.
+            if _is_dark_reference_frame(frame, cfg):
+                scores.append((offset_s, 0.0))
+                continue
+
            scores.append((offset_s, _match_score(frame, template, cfg)))

    if not scores:
@@ -1038,12 +1309,14 @@ def estimate_usable_source_duration(

    last_good = 0.0
    bad_run = 0
+    bad_run_start_offset: float | None = None
    good_scores: list[float] = []

    for offset_s, score in scores:
        if score >= min_score:
            last_good = offset_s
            bad_run = 0
+            bad_run_start_offset = None
            good_scores.append(score)
            continue

@@ -1051,7 +1324,34 @@ def estimate_usable_source_duration(
            continue

        bad_run += 1
+        if bad_run_start_offset is None:
+            bad_run_start_offset = offset_s
        if bad_run >= 3:
+            # Before killing the span, check whether the remaining scores form a
+            # stable plateau. This handles scenes where a grading/exposure
+            # difference between trailer and source causes a gradual score drop
+            # rather than a hard cut. A genuine cut produces chaotic scores;
+            # a grading mismatch produces a flat, low-but-consistent plateau.
+            # Conditions: low variance (std < 0.025), scores above pure-black
+            # (mean > 0.20), and the warmup baseline was meaningful (>= 0.30).
+            tail_scores = [s for o, s in scores if o >= bad_run_start_offset]
+            if (
+                len(tail_scores) >= 3
+                and float(np.std(tail_scores)) < 0.025
+                and float(np.mean(tail_scores)) > 0.20
+                and baseline >= 0.30
+            ):
+                logger.debug(
+                    'Beat %d: stable plateau detected at offset %.3fs '
+                    '(tail mean=%.3f std=%.3f) — extending span to full duration.',
+                    beat.beat_id, bad_run_start_offset,
+                    float(np.mean(tail_scores)), float(np.std(tail_scores)),
+                )
+                last_good = scores[-1][0]
+                good_scores.extend(tail_scores)
+                break
+            logger.debug('Beat %d: Match died at offset %.3fs. Score %.3f < min_score %.3f. Bad run count: %d', 
+                         beat.beat_id, offset_s, score, min_score, bad_run)
            break

    tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps)
@@ -1113,7 +1413,10 @@ def refine_in_point_with_sequence(
    Returns:
        (best_in_point_s, sequence_score)
    """
-    return align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s, scene_end_s)
+    best_in, best_score, _, _ = align_in_point_by_content_and_motion(
+        beat, estimated_in_point_s, cfg, search_window_s, scene_end_s
+    )
+    return best_in, best_score


 def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
@@ -1451,7 +1754,7 @@ def run_global_scan(
                        max_source_duration_s=duration_s if rough_scene_end_s is not None else None,
                    )
                content_score = original_content_score
-                content_in_s, align_content_score = align_in_point_by_content(
+                content_in_s, _, align_content_score, _ = align_in_point_by_content_and_motion(
                    b,
                    adjusted_in_s,
                    cfg,
@@ -1495,7 +1798,7 @@ def run_global_scan(
                            cfg,
                        )
                    
-                    motion_in_s, align_motion_score = align_in_point_by_motion(
+                    motion_in_s, _, _, align_motion_score = align_in_point_by_content_and_motion(
                        b,
                        adjusted_in_s,
                        cfg,
@@ -1504,6 +1807,7 @@ def run_global_scan(
                            if local_align_window_s is not None
                            else min(1.0, cfg.cv.deep_scan.content_align_window_seconds)
                        ),
+                        scene_end_s=rough_scene_end_s,
                    )
                    
                    if align_motion_score >= original_motion_score + 0.015:
@@ -1561,7 +1865,12 @@ def run_global_scan(
                )
                if len(motion_templates) >= 2:
                    motion_score_clamped = max(0.0, min(1.0, motion_score))
-                    final_score = final_score * 0.82 + motion_score_clamped * 0.18
+                    blended = final_score * 0.82 + motion_score_clamped * 0.18
+                    # Do not let motion blending drag the score below the
+                    # content-validated level.  A weak motion score often just
+                    # means the shot contains a camera pan or slow zoom; it
+                    # should not veto an otherwise well-supported content match.
+                    final_score = max(blended, final_score - 0.015)
                if is_weighted_seed_candidate:
                    vision_provisional_score = (
                        content_score * 0.45
@@ -1741,6 +2050,36 @@ def run_global_scan(
                best_result.match_score,
            )

+            # Final hi-res phase refinement: scan the full source scene at
+            # higher resolution to correct phase mismatches that the standard
+            # 160×80 features cannot resolve (e.g. talking-head close-ups).
+            final_in_s = best_result.in_point_s
+            final_scene = _find_scene_for_time(scenes, final_in_s, cfg)
+            if final_scene is not None:
+                refined_phase_in_s = _hires_phase_refine(
+                    b,
+                    final_in_s,
+                    float(final_scene.start_s),
+                    float(final_scene.end_s),
+                    cfg,
+                )
+                if refined_phase_in_s != final_in_s:
+                    final_in_s = refined_phase_in_s
+                    # Recompute out-point preserving the duration
+                    final_out_s = final_in_s + best_result.duration_s
+                    if final_scene is not None:
+                        final_out_s = min(final_out_s, float(final_scene.end_s))
+                    best_result = MatchResult(
+                        beat_id=b.beat_id,
+                        scene_id=best_result.scene_id,
+                        source_path=cfg.paths.source_movie,
+                        in_point_s=final_in_s,
+                        out_point_s=final_out_s,
+                        in_point_frame=int(final_in_s * source_fps),
+                        match_score=best_result.match_score,
+                        is_confirmed=is_confirmed,
+                    )
+
            results.append(MatchResult(
                beat_id=b.beat_id,
                scene_id=best_result.scene_id,