import logging import cv2 import numpy as np import subprocess as sp from typing import Sequence import time from dataclasses import replace from src.core.config import AppConfig from src.core.models import MatchResult, TrailerBeat from src.cv.fingerprinting import text_safe_crop from src.cv.frame_extractor import grab_frame_at_path, get_video_info, open_video, grab_frame_at logger = logging.getLogger(__name__) SeedPoint = float | tuple[float, float] _REFERENCE_CUT_CACHE: dict[tuple[str, float, float, float], list[float]] = {} def _prepare_template(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: proxy_w = cfg.video.proxy_width proxy_h = cfg.video.proxy_height cb = text_safe_crop( frame, cfg.cv.vibe_check.crop_top_fraction, cfg.cv.vibe_check.crop_bottom_fraction, ) rb = cv2.resize(cb, (proxy_w, proxy_h), interpolation=cv2.INTER_AREA) margin_y = int(proxy_h * 0.10) margin_x = int(proxy_w * 0.10) return _feature_image(rb[margin_y:proxy_h-margin_y, margin_x:proxy_w-margin_x]) def _prepare_haystack(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: cb = text_safe_crop( frame, cfg.cv.vibe_check.crop_top_fraction, cfg.cv.vibe_check.crop_bottom_fraction, ) rb = cv2.resize(cb, (cfg.video.proxy_width, cfg.video.proxy_height), interpolation=cv2.INTER_AREA) return _feature_image(rb) def _center_crop_feature(feature: np.ndarray, cfg: AppConfig) -> np.ndarray: h, w = feature.shape[:2] margin_y = int(h * 0.10) margin_x = int(w * 0.10) return feature[margin_y:h-margin_y, margin_x:w-margin_x] def _feature_image(frame: np.ndarray) -> np.ndarray: """ Convert frames to a look-tolerant matching feature. Trailer shots may be desaturated, contrast-shifted, or contain a different grade than the source movie. Matching luma plus edges is more stable than raw BGR pixels and rejects unrelated scenes with similar colors. 
""" gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) gray = cv2.equalizeHist(gray) edges = cv2.Canny(gray, 60, 140) return cv2.addWeighted(gray, 0.70, edges, 0.30, 0) def _match_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float: haystack = _prepare_haystack(frame, cfg) res = cv2.matchTemplate(haystack, template, cv2.TM_CCOEFF_NORMED) _, max_val, _, _ = cv2.minMaxLoc(res) return float(max_val) def _fixed_position_score(frame: np.ndarray, template: np.ndarray, cfg: AppConfig) -> float: fixed = _center_crop_feature(_prepare_haystack(frame, cfg), cfg) if fixed.shape != template.shape: fixed = cv2.resize(fixed, (template.shape[1], template.shape[0]), interpolation=cv2.INTER_AREA) res = cv2.matchTemplate(fixed, template, cv2.TM_CCOEFF_NORMED) _, max_val, _, _ = cv2.minMaxLoc(res) return float(max_val) def _fixed_feature(frame: np.ndarray, template_shape: tuple[int, ...], cfg: AppConfig) -> np.ndarray: fixed = _center_crop_feature(_prepare_haystack(frame, cfg), cfg) if fixed.shape != template_shape: fixed = cv2.resize(fixed, (template_shape[1], template_shape[0]), interpolation=cv2.INTER_AREA) return fixed def _corr_same_size(a: np.ndarray, b: np.ndarray) -> float: if a.shape != b.shape: b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_AREA) res = cv2.matchTemplate(a, b, cv2.TM_CCOEFF_NORMED) _, max_val, _, _ = cv2.minMaxLoc(res) if np.isnan(max_val): return 0.0 return float(max_val) def _validation_crop(frame: np.ndarray) -> np.ndarray: frame = _trim_dark_borders(frame) h = frame.shape[0] return frame[int(h * 0.05):int(h * 0.95), :] def _trim_dark_borders(frame: np.ndarray) -> np.ndarray: """ Remove encoded black matte/pillarbox borders before fixed-position checks. The reference trailer can contain vertical black bars while the source movie does not. Whole-frame spatial validation should compare picture content, not container matte. 
""" if frame.size == 0: return frame gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) h, w = gray.shape[:2] col_signal = np.percentile(gray, 90, axis=0) row_signal = np.percentile(gray, 90, axis=1) active_cols = np.where(col_signal > 18.0)[0] active_rows = np.where(row_signal > 18.0)[0] if active_cols.size >= max(8, int(w * 0.35)): x0 = max(0, int(active_cols[0]) - 2) x1 = min(w, int(active_cols[-1]) + 3) else: x0, x1 = 0, w if active_rows.size >= max(8, int(h * 0.35)): y0 = max(0, int(active_rows[0]) - 2) y1 = min(h, int(active_rows[-1]) + 3) else: y0, y1 = 0, h if x1 - x0 < int(w * 0.35) or y1 - y0 < int(h * 0.35): return frame return frame[y0:y1, x0:x1] def _fixed_luma_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: cropped = _validation_crop(frame) gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) gray = cv2.equalizeHist(gray) resized = cv2.resize(gray, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32) return (resized - float(np.mean(resized))) / (float(np.std(resized)) + 1e-6) def _fixed_edge_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: cropped = _validation_crop(frame) gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) gray = cv2.equalizeHist(gray) edges = cv2.Canny(gray, 60, 140) resized = cv2.resize(edges, (160, 80), interpolation=cv2.INTER_AREA).astype(np.float32) return (resized - float(np.mean(resized))) / (float(np.std(resized)) + 1e-6) def _fixed_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: cropped = _validation_crop(frame) resized = cv2.resize(cropped, (160, 80), interpolation=cv2.INTER_AREA) chans = cv2.split(resized) parts = [] for channel in chans: hist = cv2.calcHist([channel], [0], None, [32], [0, 256]).astype(np.float32).flatten() parts.append(hist / (float(np.sum(hist)) + 1e-6)) return np.concatenate(parts) def _fixed_spatial_hist_feature(frame: np.ndarray, cfg: AppConfig) -> np.ndarray: cropped = _validation_crop(frame) resized = cv2.resize(cropped, (160, 80), interpolation=cv2.INTER_AREA) grid_y = 
4 grid_x = 4 cell_h = resized.shape[0] // grid_y cell_w = resized.shape[1] // grid_x parts = [] for gy in range(grid_y): for gx in range(grid_x): cell = resized[gy * cell_h:(gy + 1) * cell_h, gx * cell_w:(gx + 1) * cell_w, :] for channel in cv2.split(cell): hist = cv2.calcHist([channel], [0], None, [16], [0, 256]).astype(np.float32).flatten() parts.append(hist / (float(np.sum(hist)) + 1e-6)) return np.concatenate(parts) def _array_corr(a: np.ndarray, b: np.ndarray) -> float: if a.shape != b.shape: return 0.0 return float(np.mean(a * b)) def _hist_intersection(a: np.ndarray, b: np.ndarray) -> float: if a.shape != b.shape: return 0.0 return float(np.minimum(a, b).sum() / (np.maximum(a, b).sum() + 1e-6)) def _fixed_content_features(frame: np.ndarray, cfg: AppConfig) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: return ( _fixed_luma_feature(frame, cfg), _fixed_edge_feature(frame, cfg), _fixed_hist_feature(frame, cfg), _fixed_spatial_hist_feature(frame, cfg), ) def _fixed_content_pair_score( ref_features: tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray], source_frame: np.ndarray, cfg: AppConfig, ) -> float: src_luma, src_edge, src_hist, src_spatial = _fixed_content_features(source_frame, cfg) ref_luma, ref_edge, ref_hist, ref_spatial = ref_features luma_score = _array_corr(ref_luma, src_luma) edge_score = _array_corr(ref_edge, src_edge) hist_score = _hist_intersection(ref_hist, src_hist) spatial_score = _hist_intersection(ref_spatial, src_spatial) return ( edge_score * 0.24 + luma_score * 0.24 + hist_score * 0.14 + spatial_score * 0.38 ) def _prepare_validation_templates( beat: TrailerBeat, cfg: AppConfig, ) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]: step_s = max(0.20, cfg.cv.deep_scan.content_align_sample_step_s * 1.5) matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=step_s) templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]] = [] t = 0.0 while t <= 
matchable_s: frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) if frame is not None and _is_scoreable_reference_frame(frame, cfg): templates.append((t, _fixed_content_features(frame, cfg))) t = round(t + step_s, 6) if len(templates) >= 3: return templates fallback: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]] = [] for offset_s in _beat_offsets(matchable_s): frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s) if frame is not None and _is_scoreable_reference_frame(frame, cfg): fallback.append((offset_s, _fixed_content_features(frame, cfg))) return fallback def _prepare_rerank_templates( beat: TrailerBeat, cfg: AppConfig, ) -> list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]]: matchable_s = estimate_matchable_reference_duration(beat, cfg) templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]] = [] for offset_s in _beat_offsets(matchable_s): frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s) if frame is not None and _is_scoreable_reference_frame(frame, cfg): templates.append((offset_s, _fixed_content_features(frame, cfg))) return templates def _fixed_content_sequence_score( cap: cv2.VideoCapture, in_point_s: float, templates: list[tuple[float, tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]], cfg: AppConfig, ) -> float: if not templates: return 0.0 scores: list[float] = [] for offset_s, ref_features in templates: frame = grab_frame_at(cap, in_point_s + offset_s) if frame is None: return 0.0 scores.append(_fixed_content_pair_score(ref_features, frame, cfg)) if not scores: return 0.0 return float((sum(scores) / len(scores)) * 0.68 + min(scores) * 0.32) def _reference_internal_cut_offsets(beat: TrailerBeat, cfg: AppConfig) -> list[float]: """Detect hard visual cuts inside a single trailer beat.""" cache_key = ( str(beat.trailer_path), round(float(beat.start_s), 3), round(float(beat.end_s), 3), 
round(float(cfg.vision.multi_shot_cut_corr_threshold), 3), ) cached = _REFERENCE_CUT_CACHE.get(cache_key) if cached is not None: return cached step_s = max(1.0 / cfg.export.edl_frame_rate, 0.08) previous: np.ndarray | None = None cuts: list[float] = [] t = 0.0 while t <= beat.duration_s: frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) if frame is not None and _is_scoreable_reference_frame(frame, cfg): feature = _prepare_haystack(frame, cfg) if previous is not None: corr = _corr_same_size(previous, feature) if ( corr < cfg.vision.multi_shot_cut_corr_threshold and 0.18 < t < beat.duration_s - 0.18 and (not cuts or t - cuts[-1] > 0.24) ): cuts.append(round(t, 3)) previous = feature t = round(t + step_s, 6) if cuts: logger.debug('Beat %d: detected internal trailer cuts at %s', beat.beat_id, cuts) _REFERENCE_CUT_CACHE[cache_key] = cuts return cuts def _scene_fps_estimate(scene, cfg: AppConfig) -> float: duration_s = max(0.0, float(scene.end_s) - float(scene.start_s)) frame_count = max(0, int(scene.end_frame) - int(scene.start_frame)) if duration_s <= 0.0 or frame_count <= 0: return cfg.export.edl_frame_rate return frame_count / duration_s def _contiguous_scene_coverage_duration( beat: TrailerBeat, in_point_s: float, scenes: Sequence | None, matchable_duration_s: float, cfg: AppConfig, ) -> float: """ Allow a source span to cross scene boundaries only when the trailer beat has matching internal cuts at the same relative offsets. 
""" if not scenes or matchable_duration_s <= 0: return 0.0 start_idx = None for idx, scene in enumerate(scenes): if float(scene.start_s) <= in_point_s < float(scene.end_s): start_idx = idx break if start_idx is None: return 0.0 cut_offsets = _reference_internal_cut_offsets(beat, cfg) target_end = in_point_s + matchable_duration_s current_end = in_point_s for scene in scenes[start_idx:]: scene_end = float(scene.end_s) fps = _scene_fps_estimate(scene, cfg) tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps) if target_end <= scene_end: return matchable_duration_s boundary_offset = scene_end - in_point_s boundary_matches_ref_cut = any( abs(boundary_offset - cut_offset) <= cfg.vision.multi_shot_boundary_tolerance_s for cut_offset in cut_offsets ) if not boundary_matches_ref_cut: return max(0.0, scene_end - in_point_s - tail_s) current_end = scene_end return max(0.0, current_end - in_point_s) def _rerank_candidates_by_content( beat: TrailerBeat, candidates: list[tuple[float, float]], cfg: AppConfig, scenes: Sequence | None = None, matchable_duration_s: float | None = None, ) -> list[tuple[float, float, float]]: templates = _prepare_rerank_templates(beat, cfg) if not templates: return [(score, score, t_sec) for score, t_sec in candidates] reranked: list[tuple[float, float, float]] = [] with open_video(cfg.paths.source_movie) as cap: for coarse_score, t_sec in candidates: content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg) coverage_score = 1.0 if scenes is not None and matchable_duration_s and matchable_duration_s > 0: usable_s = _contiguous_scene_coverage_duration( beat, t_sec, scenes, matchable_duration_s, cfg, ) coverage_score = min(1.0, usable_s / matchable_duration_s) rank_score = ( content_score * 0.62 + coarse_score * 0.18 + coverage_score * 0.20 ) reranked.append((rank_score, coarse_score, t_sec)) return sorted(reranked, key=lambda item: item[0], reverse=True) def _dense_weighted_seed_candidates( beat: TrailerBeat, seed_candidates: 
list[tuple[float, float]], cfg: AppConfig, scenes: Sequence | None, matchable_duration_s: float, ) -> list[tuple[float, float]]: """Scan vision-selected source scenes densely with fixed-position content features.""" if not scenes or not seed_candidates: return [] weighted_floor = cfg.cv.deep_scan.coarse_candidate_threshold + 0.05 seeded_scenes: dict[int, tuple[object, float]] = {} for seed_score, seed_t in seed_candidates: if seed_score <= weighted_floor: continue scene = _find_scene_for_time(scenes, seed_t, cfg) if scene is None: continue previous = seeded_scenes.get(scene.scene_id) if previous is None or seed_score > previous[1]: seeded_scenes[scene.scene_id] = (scene, seed_score) if not seeded_scenes: return [] templates = _prepare_rerank_templates(beat, cfg) if not templates: return [] cut_offsets = _reference_internal_cut_offsets(beat, cfg) dense: list[tuple[float, float, float, float, int]] = [] with open_video(cfg.paths.source_movie) as cap: for scene, seed_score in seeded_scenes.values(): fps = _source_fps_from_scene(scene) or cfg.export.edl_frame_rate tail_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / fps) start_s = max(0.0, float(scene.start_s)) end_s = max(start_s, float(scene.end_s) - tail_s) if end_s <= start_s: continue span_s = end_s - start_s step_s = max(0.04, cfg.vision.local_scan_step_s) max_points = max(2, cfg.vision.local_scan_max_points_per_scene) point_count = int(span_s / step_s) + 1 if point_count > max_points: step_s = span_s / float(max_points - 1) t_sec = start_s while t_sec <= end_s + 0.001: content_score = _fixed_content_sequence_score(cap, t_sec, templates, cfg) usable_s = max(0.0, float(scene.end_s) - t_sec - tail_s) coverage_score = ( min(1.0, usable_s / matchable_duration_s) if matchable_duration_s > 0 else 0.0 ) rank_score = ( content_score * 0.50 + coverage_score * 0.35 + seed_score * 0.15 ) coarse_score = max( weighted_floor, min(0.99, seed_score * 0.80 + content_score * 0.20), ) dense.append((rank_score, coarse_score, t_sec, 
content_score, scene.scene_id)) t_sec += step_s for cut_offset in cut_offsets: shifted_t = max(0.0, float(scene.start_s) - cut_offset) coverage_score = ( min( 1.0, _contiguous_scene_coverage_duration( beat, shifted_t, scenes, matchable_duration_s, cfg, ) / matchable_duration_s, ) if matchable_duration_s > 0 else 0.0 ) if coverage_score < 0.80: continue content_score = _fixed_content_sequence_score(cap, shifted_t, templates, cfg) rank_score = ( content_score * 0.56 + coverage_score * 0.34 + seed_score * 0.10 ) coarse_score = max( weighted_floor, min(0.99, seed_score * 0.78 + content_score * 0.22), ) dense.append((rank_score, coarse_score, shifted_t, content_score, scene.scene_id)) dense.sort(key=lambda item: item[0], reverse=True) top = dense[: max(0, cfg.vision.local_scan_top_candidates)] if top: logger.info( 'Beat %d: dense vision content scan kept %d/%d candidates; best scene=%d in=%.3fs content=%.3f rank=%.3f.', beat.beat_id, len(top), len(dense), top[0][4], top[0][2], top[0][3], top[0][0], ) return [(coarse_score, t_sec) for _, coarse_score, t_sec, _, _ in top] def _beat_offsets(duration_s: float) -> list[float]: """Use several frames across the beat, including the leading edge.""" if duration_s < 1.0: return [0.0, duration_s * 0.35, duration_s * 0.70] if duration_s < 2.5: return [duration_s * r for r in (0.00, 0.15, 0.35, 0.55, 0.78)] return [duration_s * r for r in (0.00, 0.12, 0.30, 0.50, 0.70, 0.88)] def _prepare_beat_templates(beat: TrailerBeat, cfg: AppConfig) -> list[tuple[float, np.ndarray]]: templates: list[tuple[float, np.ndarray]] = [] matchable_s = estimate_matchable_reference_duration(beat, cfg) for offset_s in _beat_offsets(matchable_s): frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s) if frame is None or not _is_scoreable_reference_frame(frame, cfg): continue templates.append((offset_s, _prepare_template(frame, cfg))) return templates def _prepare_beat_templates_stepped( beat: TrailerBeat, cfg: AppConfig, step_s: float = 
0.12, ) -> list[tuple[float, np.ndarray]]: templates: list[tuple[float, np.ndarray]] = [] matchable_s = estimate_matchable_reference_duration(beat, cfg, sample_step_s=step_s) t = 0.0 while t <= matchable_s: frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) if frame is not None and _is_scoreable_reference_frame(frame, cfg): templates.append((t, _prepare_template(frame, cfg))) t = round(t + step_s, 6) return templates def _prepare_motion_templates( beat: TrailerBeat, cfg: AppConfig, step_s: float = 0.12, ) -> list[tuple[float, float, np.ndarray, tuple[int, ...]]]: """ Build reference frame-difference templates for motion-phase alignment. Absolute image similarity can match the right shot at the wrong point in a repeated movement. Frame-to-frame deltas make the refine pass care about the phase and direction of motion as well. """ result: list[tuple[float, float, np.ndarray, tuple[int, ...]]] = [] max_offset = max(0.0, beat.duration_s - step_s) t = 0.0 while t <= max_offset: f0 = grab_frame_at_path(beat.trailer_path, beat.start_s + t) f1 = grab_frame_at_path(beat.trailer_path, beat.start_s + t + step_s) if ( f0 is not None and f1 is not None and _is_scoreable_reference_frame(f0, cfg) and _is_scoreable_reference_frame(f1, cfg) ): feat0 = _prepare_template(f0, cfg) feat1 = _prepare_template(f1, cfg) result.append((t, step_s, cv2.absdiff(feat1, feat0), feat0.shape)) t = round(t + step_s, 6) return result def _is_dark_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool: """Truly dark / pure-black frame: no usable structure for matching. A cross-fade silhouette (low overall luma but visible contrast) is NOT a dark frame for our purposes — it carries content (a hand, a knife, a face peeking through the fade) and should still be matchable. 
""" cropped = text_safe_crop( frame, cfg.cv.vibe_check.crop_top_fraction, cfg.cv.vibe_check.crop_bottom_fraction, ) gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) mean = float(np.mean(gray)) p90 = float(np.percentile(gray, 90)) p10 = float(np.percentile(gray, 10)) contrast = p90 - p10 # Real darkness: low luma AND low contrast (no structure visible) return mean < 28.0 and p90 < 58.0 and contrast < 30.0 def _reference_visibility_stats(frame: np.ndarray, cfg: AppConfig) -> tuple[float, float, float]: cropped = text_safe_crop( frame, cfg.cv.vibe_check.crop_top_fraction, cfg.cv.vibe_check.crop_bottom_fraction, ) gray = cv2.cvtColor(cropped, cv2.COLOR_BGR2GRAY) p10 = float(np.percentile(gray, 10)) p90 = float(np.percentile(gray, 90)) return float(np.mean(gray)), p90, p90 - p10 def _is_scoreable_reference_frame(frame: np.ndarray, cfg: AppConfig) -> bool: """Decide whether a reference frame can carry a usable match template. Two acceptance paths: * Standard: regular daylight / interior shot — luma at or above the configured thresholds AND enough contrast to be distinct. * Fade-content: low overall luma BUT with strong local contrast, i.e. a cross-fade silhouette where you can clearly see structure (hand+knife against dark, face emerging from black, etc.). Without this path the matcher would silently drop content-bearing fades and mis-match the visible portion alone. """ if _is_dark_reference_frame(frame, cfg): return False mean_luma, p90_luma, contrast = _reference_visibility_stats(frame, cfg) # Standard daylight / interior shot enough_luma = ( mean_luma >= cfg.cv.deep_scan.scoreable_luma_mean_min or p90_luma >= cfg.cv.deep_scan.scoreable_luma_p90_min ) if enough_luma and contrast >= cfg.cv.deep_scan.scoreable_contrast_min: return True # Fade-content: dim but with structure. The local contrast must be # well above what a uniform dim frame would have, and at least a few # bright pixels must exist (p90 above pure-black), so we don't accept # a featureless dark wash. 
These thresholds are deliberately tighter # than the standard path so we don't pollute scoring with smooth fades. if contrast >= 40.0 and p90_luma >= 30.0: return True return False def estimate_matchable_reference_duration( beat: TrailerBeat, cfg: AppConfig, sample_step_s: float | None = None, ) -> float: """ Estimate the part of a trailer beat that should be source-matchable. Trailer beats often include trailing black/title/credit frames that do not exist in the source movie. Those frames should not force the source match to cover the full beat duration. """ step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s samples: list[tuple[float, bool]] = [] t = 0.0 while t <= beat.duration_s: frame = grab_frame_at_path(beat.trailer_path, beat.start_s + t) if frame is not None: samples.append((t, _is_dark_reference_frame(frame, cfg))) t = round(t + step_s, 6) if not samples: return beat.duration_s dark_run_start: float | None = None saw_visible = False min_dark_break_s = max(0.24, step_s * 2.0) for offset_s, is_dark in samples: if not is_dark: saw_visible = True dark_run_start = None continue if saw_visible: if dark_run_start is None: dark_run_start = offset_s if offset_s - dark_run_start >= min_dark_break_s: break if dark_run_start is None: return beat.duration_s # Keep a small buffer before the first sustained dark/title break so the # source clip does not visibly end before the trailer begins its fade/card. # Long beats can contain later credit/title islands; those should not force # one source clip to validate unrelated images. 
return max(step_s, min(beat.duration_s, dark_run_start + step_s)) def _sequence_score( cap: cv2.VideoCapture, in_point_s: float, templates: list[tuple[float, np.ndarray]], cfg: AppConfig, ) -> float: weighted_scores: list[float] = [] raw_scores: list[float] = [] for offset_s, template in templates: frame = grab_frame_at(cap, in_point_s + offset_s) if frame is None: return -1.0 floating_score = _match_score(frame, template, cfg) fixed_score = _fixed_position_score(frame, template, cfg) score = (floating_score * 0.55) + (fixed_score * 0.45) # The first frames matter most for perceived sync. Weight them higher # so a match that begins a few frames early loses to a better aligned hit. weight = 1.35 if offset_s <= 0.16 else 1.0 weighted_scores.append(score * weight) raw_scores.append(score) if not raw_scores: return -1.0 # Reward consistently good temporal alignment. A single strong frame is not # enough if the other beat frames drift away. weighted_avg = sum(weighted_scores) / (len(raw_scores) + 0.35 * sum(1 for o, _ in templates if o <= 0.16)) return float(weighted_avg * 0.70 + min(raw_scores) * 0.30) def _content_alignment_templates( beat: TrailerBeat, cfg: AppConfig, ) -> list[tuple[float, np.ndarray]]: matchable_s = estimate_matchable_reference_duration( beat, cfg, sample_step_s=cfg.cv.deep_scan.content_align_sample_step_s, ) step_s = max(1.0 / cfg.export.edl_frame_rate, cfg.cv.deep_scan.content_align_sample_step_s) max_offset_s = max(0.0, min(beat.duration_s, matchable_s) - step_s) offsets = [0.0] t = step_s while t <= max_offset_s: offsets.append(round(t, 6)) t = round(t + step_s, 6) if matchable_s > step_s and offsets[-1] < max_offset_s: offsets.append(round(max_offset_s, 6)) templates: list[tuple[float, np.ndarray]] = [] for offset_s in offsets: frame = grab_frame_at_path(beat.trailer_path, beat.start_s + offset_s) if frame is not None: if not _is_scoreable_reference_frame(frame, cfg): continue templates.append((offset_s, _prepare_template(frame, cfg))) if not 
templates: return _prepare_beat_templates(beat, cfg) return templates def _content_alignment_score( cap: cv2.VideoCapture, in_point_s: float, templates: list[tuple[float, np.ndarray]], cfg: AppConfig, ) -> float: if not templates: return -1.0 weighted_total = 0.0 weight_total = 0.0 raw_scores: list[float] = [] early_scores: list[float] = [] for offset_s, template in templates: frame = grab_frame_at(cap, in_point_s + offset_s) if frame is None: return -1.0 # For offset detection the fixed frame position is intentionally more # important than free template placement. Free placement can make the # right shot look acceptable even when the movement is a few frames off. fixed_score = _fixed_position_score(frame, template, cfg) floating_score = _match_score(frame, template, cfg) score = fixed_score * 0.72 + floating_score * 0.28 weight = 1.45 if offset_s <= 0.20 else 1.0 weighted_total += score * weight weight_total += weight raw_scores.append(score) if offset_s <= 0.36: early_scores.append(score) avg_score = weighted_total / weight_total if weight_total > 0 else -1.0 min_score = min(raw_scores) if raw_scores else -1.0 early_score = sum(early_scores) / len(early_scores) if early_scores else avg_score return float(avg_score * 0.55 + min_score * 0.25 + early_score * 0.20) def align_in_point_by_content( beat: TrailerBeat, estimated_in_point_s: float, cfg: AppConfig, search_window_s: float | None = None, ) -> tuple[float, float]: """ Find the frame offset directly from image content around a rough match. This is deliberately local: once a candidate shot is plausible, scanning a small window around it with many reference frames is faster and more robust than repeating a global scan or applying a fixed frame preroll. 
""" templates = _content_alignment_templates(beat, cfg) if not templates: return estimated_in_point_s, 0.0 with open_video(cfg.paths.source_movie) as cap: fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate frame_step_s = 1.0 / fps window_s = ( search_window_s if search_window_s is not None else cfg.cv.deep_scan.content_align_window_seconds ) start_s = max(0.0, estimated_in_point_s - window_s) end_s = estimated_in_point_s + window_s tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta best_in = estimated_in_point_s best_score = -1.0 t = start_s while t <= end_s: score = _content_alignment_score(cap, t, templates, cfg) if score > best_score + tie_delta: best_score = score best_in = t elif score >= best_score - tie_delta and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s): best_in = t t = round(t + frame_step_s, 6) return best_in, max(0.0, best_score) def _motion_phase_score( cap: cv2.VideoCapture, in_point_s: float, motion_templates: list[tuple[float, float, np.ndarray, tuple[int, ...]]], cfg: AppConfig, ) -> float: scores: list[float] = [] for offset_s, step_s, ref_delta, template_shape in motion_templates: f0 = grab_frame_at(cap, in_point_s + offset_s) f1 = grab_frame_at(cap, in_point_s + offset_s + step_s) if f0 is None or f1 is None: return -1.0 src0 = _fixed_feature(f0, template_shape, cfg) src1 = _fixed_feature(f1, template_shape, cfg) scores.append(_corr_same_size(cv2.absdiff(src1, src0), ref_delta)) if not scores: return 0.0 return float((sum(scores) / len(scores)) * 0.65 + min(scores) * 0.35) def align_in_point_by_motion( beat: TrailerBeat, estimated_in_point_s: float, cfg: AppConfig, search_window_s: float | None = None, ) -> tuple[float, float]: """ Align a candidate by matching the frame-to-frame motion pattern. This catches the common failure mode where the right source scene is found, but the in-point is a few seconds too early or late inside a repeated conversation/action beat. 
""" motion_templates = _prepare_motion_templates(beat, cfg) if len(motion_templates) < 2: return estimated_in_point_s, 0.0 with open_video(cfg.paths.source_movie) as cap: fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate frame_step_s = 1.0 / fps window_s = ( search_window_s if search_window_s is not None else cfg.cv.deep_scan.content_align_window_seconds ) start_s = max(0.0, estimated_in_point_s - window_s) end_s = estimated_in_point_s + window_s tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta best_in = estimated_in_point_s best_score = -1.0 t = start_s while t <= end_s: score = _motion_phase_score(cap, t, motion_templates, cfg) if score > best_score + tie_delta: best_score = score best_in = t elif score >= best_score - tie_delta and abs(t - estimated_in_point_s) < abs(best_in - estimated_in_point_s): best_in = t t = round(t + frame_step_s, 6) return best_in, max(0.0, best_score) def align_in_point_by_content_and_motion( beat: TrailerBeat, estimated_in_point_s: float, cfg: AppConfig, search_window_s: float | None = None, ) -> tuple[float, float, float, float]: """ Align a candidate using still-frame content and motion phase together. Running content and motion as separate passes can overshoot short action phases: one pass may land on the right broad gesture and the next can slide to a visually similar but later posture. A joint score keeps the in-point tied to the same frame hypothesis throughout the local search. 
""" templates = _prepare_beat_templates(beat, cfg) motion_templates = _prepare_motion_templates(beat, cfg) if not templates: return estimated_in_point_s, 0.0, 0.0, 0.0 with open_video(cfg.paths.source_movie) as cap: fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate frame_step_s = 1.0 / fps window_s = ( search_window_s if search_window_s is not None else cfg.cv.deep_scan.content_align_window_seconds ) start_s = max(0.0, estimated_in_point_s - window_s) end_s = estimated_in_point_s + window_s tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta best_in = estimated_in_point_s best_score = -1.0 best_content = -1.0 best_motion = -1.0 t = start_s while t <= end_s: content_score = _content_alignment_score(cap, t, templates, cfg) motion_score = ( _motion_phase_score(cap, t, motion_templates, cfg) if len(motion_templates) >= 2 else content_score ) if content_score < 0 or motion_score < 0: t = round(t + frame_step_s, 6) continue raw_score = content_score * 0.64 + motion_score * 0.36 anchor_penalty = min(0.18, abs(t - estimated_in_point_s) * 0.05) score = raw_score - anchor_penalty if score > best_score + tie_delta: best_score = score best_in = t best_content = content_score best_motion = motion_score elif score >= best_score - tie_delta: current_distance = abs(t - estimated_in_point_s) best_distance = abs(best_in - estimated_in_point_s) if current_distance < best_distance or ( abs(current_distance - best_distance) <= frame_step_s * 0.5 and t < best_in ): best_in = t best_content = content_score best_motion = motion_score t = round(t + frame_step_s, 6) return best_in, max(0.0, best_score), max(0.0, best_content), max(0.0, best_motion) def estimate_usable_source_duration( beat: TrailerBeat, in_point_s: float, cfg: AppConfig, sample_step_s: float | None = None, min_keep_s: float = 0.5, ) -> tuple[float, float]: """ Estimate how long the source stays visually aligned with the beat. 
This catches cases where the source dissolves/cuts into the next shot while the trailer beat continues into a title card or black fade. Returns: (usable_duration_s, average_good_score) """ step_s = sample_step_s if sample_step_s is not None else cfg.cv.deep_scan.span_sample_step_s templates = _prepare_beat_templates_stepped(beat, cfg, step_s) if not templates: return beat.duration_s, 0.0 scores: list[tuple[float, float]] = [] source_fps = cfg.export.edl_frame_rate with open_video(cfg.paths.source_movie) as cap: source_fps = float(cap.get(cv2.CAP_PROP_FPS)) or cfg.export.edl_frame_rate for offset_s, template in templates: frame = grab_frame_at(cap, in_point_s + offset_s) if frame is None: break scores.append((offset_s, _match_score(frame, template, cfg))) if not scores: return 0.0, 0.0 warmup_scores = [score for offset, score in scores if offset <= min(1.0, beat.duration_s * 0.35)] baseline = max(warmup_scores) if warmup_scores else max(score for _, score in scores) min_score = max(0.34, baseline * 0.48) last_good = 0.0 bad_run = 0 good_scores: list[float] = [] for offset_s, score in scores: if score >= min_score: last_good = offset_s bad_run = 0 good_scores.append(score) continue if offset_s < min_keep_s: continue bad_run += 1 if bad_run >= 3: break tail_safety_s = max(0.0, cfg.cv.deep_scan.trim_tail_frames / source_fps) usable = min(beat.duration_s, max(0.0, last_good - tail_safety_s)) if usable < min_keep_s and scores: usable = min(beat.duration_s, max(min_keep_s, scores[0][0] + step_s - tail_safety_s)) avg_good = float(sum(good_scores) / len(good_scores)) if good_scores else 0.0 return usable, avg_good def refine_timestamp(template: np.ndarray, t_sec: float, cfg: AppConfig) -> float: best_score = -1.0 best_t = t_sec tie_delta = cfg.cv.deep_scan.start_tie_break_score_delta with open_video(cfg.paths.source_movie) as cap: fps = float(cap.get(cv2.CAP_PROP_FPS)) step = 1.0 / fps start_t = max(0.0, t_sec - 0.5) end_t = t_sec + 0.5 t = start_t while t <= end_t: frame = 
def refine_in_point_with_sequence(
    beat: TrailerBeat,
    estimated_in_point_s: float,
    cfg: AppConfig,
    search_window_s: float | None = None,
) -> tuple[float, float]:
    """
    Refine a rough source in-point for ``beat``.

    Thin wrapper: forwards all arguments unchanged to
    ``align_in_point_by_content`` and returns its result.

    Returns:
        (best_in_point_s, sequence_score)
    """
    aligned = align_in_point_by_content(beat, estimated_in_point_s, cfg, search_window_s)
    return aligned


def _find_scene_for_time(scenes: Sequence | None, t_sec: float, cfg: AppConfig):
    """
    Return the scene whose ``[start_s, end_s)`` interval contains ``t_sec``.

    When ``t_sec`` lies within ``scene_boundary_epsilon_s`` of that scene's
    end and a following scene exists, the following scene is returned
    instead. Returns ``None`` when ``scenes`` is empty/None or no scene
    contains ``t_sec``.
    """
    if not scenes:
        return None

    last_position = len(scenes) - 1
    for position, current in enumerate(scenes):
        if not (current.start_s <= t_sec < current.end_s):
            continue
        hugging_end = current.end_s - t_sec <= cfg.cv.deep_scan.scene_boundary_epsilon_s
        if hugging_end and position < last_position:
            return scenes[position + 1]
        return current
    return None


def _source_fps_from_scene(scene) -> float:
    """
    Derive a frame rate from a scene's frame span divided by its time span.

    Returns 0.0 when either span is not positive.
    """
    elapsed_s = max(0.0, scene.end_s - scene.start_s)
    frames = max(0, scene.end_frame - scene.start_frame)
    if elapsed_s > 0 and frames > 0:
        return frames / elapsed_s
    return 0.0


def _apply_start_preroll(in_point_s: float, source_fps: float, cfg: AppConfig) -> float:
    """
    Shift ``in_point_s`` earlier by the configured number of pre-roll frames.

    A falsy ``source_fps`` falls back to ``cfg.export.edl_frame_rate``. The
    result is clamped at 0.0; a non-positive pre-roll leaves the in-point
    untouched.
    """
    preroll_frames = cfg.cv.deep_scan.start_preroll_frames
    if preroll_frames <= 0:
        return in_point_s

    effective_fps = source_fps if source_fps else cfg.export.edl_frame_rate
    shifted_s = in_point_s - (preroll_frames / effective_fps)
    return max(0.0, shifted_s)


def _clamp_to_scene_start(in_point_s: float, scene) -> float:
    """Return ``in_point_s``, raised to ``scene.start_s`` when a scene is given."""
    return in_point_s if scene is None else max(float(scene.start_s), in_point_s)


def _add_top_candidate(
    candidates: list[tuple[float, float]],
    score: float,
    t_sec: float,
    max_candidates: int,
    min_distance_s: float,
) -> list[tuple[float, float]]:
    """
    Merge one (score, time) candidate into a ranked, time-diverse pool.

    Relying on a single best midpoint frame is brittle: recurring actors,
    similar palettes, cars, forests and dark title cards all produce
    plausible false positives. Keeping several ranked candidates lets the
    later multi-frame sequence pass pick the temporally consistent one.

    If the first existing entry closer than ``min_distance_s`` to ``t_sec``
    has a lower score it is overwritten; if it scores at least as high the
    new candidate is dropped. With no nearby entry the candidate is
    appended. ``candidates`` is modified in place; the return value is a new
    list sorted by descending score and cut to ``max_candidates``.
    """
    neighbour_idx = next(
        (
            position
            for position, (_, existing_t) in enumerate(candidates)
            if abs(existing_t - t_sec) < min_distance_s
        ),
        None,
    )

    if neighbour_idx is None:
        candidates.append((score, t_sec))
    elif score > candidates[neighbour_idx][0]:
        candidates[neighbour_idx] = (score, t_sec)

    ranked = sorted(candidates, key=lambda pair: pair[0], reverse=True)
    return ranked[:max_candidates]
def run_global_scan(
    beats: Sequence[TrailerBeat],
    cfg: AppConfig,
    scenes: Sequence | None = None,
    seed_in_points: dict[int, Sequence[SeedPoint]] | None = None,
) -> list[MatchResult]:
    """
    Locate each trailer beat in the source movie and return the accepted matches.

    Stages, in order:

    1. Build a midpoint template and a list of (offset, template) pairs for
       every beat from the reference trailer. Beats without a midpoint frame
       or without templates are marked invalid and skipped later.
    2. Register caller-supplied seed in-points as candidates. A seed is
       either a bare time or a ``(time, score)`` tuple; tuple scores are
       clamped to ``[coarse_candidate_threshold, 0.99]``.
    3. Unless weighted seeds allow skipping it, stream the source movie
       through FFmpeg at 2 fps and template-match every frame against every
       beat template, keeping a diverse top-candidate pool per beat.
    4. Per beat: rerank the pool by content, refine each shortlisted
       candidate (timestamp, start pre-roll, scene clamp, usable duration,
       content/motion validation), score it, and keep the best result.
    5. If the coarse scan was skipped and nothing matched, optionally retry
       once with the full coarse scan enabled.

    Args:
        beats: Trailer beats to match.
        cfg: Application configuration.
        scenes: Optional scene list used to clamp in/out points.
        seed_in_points: Optional mapping ``beat_id -> seeds``.

    Returns:
        One ``MatchResult`` per beat whose best candidate reached the
        provisional match threshold; ``is_confirmed`` is set when it also
        reached the confirmed match threshold.
    """
    logger.info('[Global Scan] Preparing templates for %d beats...', len(beats))
    templates = []
    midpoint_templates = []
    beat_valid = []

    for b in beats:
        bf = grab_frame_at_path(cfg.paths.reference_trailer, b.start_s + (b.end_s - b.start_s)/2)
        if bf is None:
            midpoint_templates.append(None)
            templates.append([])
            beat_valid.append(False)
            continue
        midpoint_templates.append(_prepare_template(bf, cfg))
        beat_templates = _prepare_beat_templates(b, cfg)
        templates.append(beat_templates)
        beat_valid.append(bool(beat_templates))

    # Candidate tuples are (score, in_point_s).
    top_candidates: list[list[tuple[float, float]]] = [[] for _ in beats]
    seed_candidates: list[list[tuple[float, float]]] = [[] for _ in beats]
    has_weighted_seeds = False
    for idx, beat in enumerate(beats):
        for seed in (seed_in_points or {}).get(beat.beat_id, ()):
            if isinstance(seed, tuple):
                seed_t = float(seed[0])
                seed_score = max(
                    cfg.cv.deep_scan.coarse_candidate_threshold,
                    min(0.99, float(seed[1])),
                )
                has_weighted_seeds = True
            else:
                seed_t = float(seed)
                seed_score = cfg.cv.deep_scan.coarse_candidate_threshold
            seed_candidate = (
                seed_score,
                max(0.0, seed_t),
            )
            seed_candidates[idx].append(seed_candidate)
            top_candidates[idx] = _add_top_candidate(
                top_candidates[idx],
                seed_candidate[0],
                seed_candidate[1],
                max_candidates=cfg.cv.deep_scan.sequence_candidate_count,
                min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s,
            )
        if (seed_in_points or {}).get(beat.beat_id):
            logger.info(
                'Beat %d: added %d seeded in-point candidates.',
                beat.beat_id,
                len((seed_in_points or {}).get(beat.beat_id, ())),
            )

    # The coarse pass is skipped only when every valid beat already has at
    # least one candidate from the seeds.
    skip_coarse_scan = (
        cfg.vision.enabled
        and cfg.cv.deep_scan.skip_coarse_scan_with_weighted_seeds
        and has_weighted_seeds
        and all(top_candidates[i] for i, valid in enumerate(beat_valid) if valid)
    )

    if skip_coarse_scan:
        logger.info('[Global Scan] Weighted vision seeds present; skipping full FFmpeg coarse scan.')
    else:
        fps = 2.0
        cmd = [
            'ffmpeg', '-i', str(cfg.paths.source_movie),
            '-vf', f'scale={cfg.video.proxy_width}:{cfg.video.proxy_height},fps={fps}',
            '-f', 'image2pipe', '-vcodec', 'rawvideo', '-pix_fmt', 'bgr24', '-'
        ]
        logger.info('[Global Scan] Streaming %s via FFmpeg (%.1f fps) ...', cfg.paths.source_movie.name, fps)
        frame_size = cfg.video.proxy_width * cfg.video.proxy_height * 3
        frame_idx = 0
        start_t = time.time()
        # The context manager closes the pipe and waits for FFmpeg even when
        # matching raises, so the child process is never left dangling.
        with sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.DEVNULL) as p:
            while True:
                raw = p.stdout.read(frame_size)
                if len(raw) != frame_size:
                    # Short read: end of stream (or FFmpeg stopped early).
                    break
                frame = np.frombuffer(raw, dtype=np.uint8).reshape((cfg.video.proxy_height, cfg.video.proxy_width, 3))
                haystack = _prepare_haystack(frame, cfg)
                source_t = frame_idx / fps
                for i, beat_templates in enumerate(templates):
                    if not beat_valid[i]:
                        continue
                    for beat_offset_s, template in beat_templates:
                        res = cv2.matchTemplate(haystack, template, cv2.TM_CCOEFF_NORMED)
                        _, max_val, _, _ = cv2.minMaxLoc(res)
                        # A template taken at beat_offset_s implies the beat
                        # would start that much earlier in the source.
                        candidate_in_s = source_t - beat_offset_s
                        if candidate_in_s < 0.0:
                            continue
                        top_candidates[i] = _add_top_candidate(
                            top_candidates[i],
                            float(max_val),
                            candidate_in_s,
                            max_candidates=cfg.cv.deep_scan.sequence_candidate_count,
                            min_distance_s=cfg.cv.deep_scan.sequence_min_distance_s,
                        )
                frame_idx += 1
                if frame_idx % 1000 == 0:
                    logger.info('[Global Scan] Processed %d frames (%.1fs movie time)...', frame_idx, frame_idx / fps)
        if p.returncode:
            # stderr is discarded, so the exit code is the only failure signal.
            logger.warning(
                '[Global Scan] FFmpeg exited with code %s after %d frames.',
                p.returncode,
                frame_idx,
            )
        logger.info('[Global Scan] Finished streaming %d frames in %.1fs.', frame_idx, time.time() - start_t)

    results = []
    source_info = get_video_info(cfg.paths.source_movie)
    source_fps = float(source_info['fps']) or 24.0

    for i, b in enumerate(beats):
        if not beat_valid[i]:
            continue
        candidates = top_candidates[i]
        if not candidates:
            continue

        score = float(candidates[0][0])
        if score >= cfg.cv.deep_scan.coarse_candidate_threshold:
            matchable_duration_s = estimate_matchable_reference_duration(b, cfg)
            logger.info(
                'Beat %d: refining %d temporal candidates (best offset score %.3f, matchable %.2fs / beat %.2fs).',
                b.beat_id,
                len(candidates),
                score,
                matchable_duration_s,
                b.duration_s,
            )
            best_result: MatchResult | None = None
            best_short_result: MatchResult | None = None
            best_short_coverage = -1.0
            best_duration_coverage = -1.0
            best_content_score = -1.0
            rejected_short_candidates = 0
            rejected_content_candidates = 0
            scan_cfg = cfg.cv.deep_scan
            content_gate = (
                min(scan_cfg.provisional_content_threshold, cfg.vision.content_threshold)
                if skip_coarse_scan and has_weighted_seeds
                else scan_cfg.provisional_content_threshold
            )

            # Build the pool to rerank: top coarse candidates, plus all seeds,
            # plus (seed-only mode) densely sampled candidates around seeds.
            candidate_pool = candidates[:scan_cfg.content_rerank_candidate_count]
            for seed_candidate in seed_candidates[i]:
                candidate_pool = _add_top_candidate(
                    candidate_pool,
                    seed_candidate[0],
                    seed_candidate[1],
                    max_candidates=scan_cfg.content_rerank_candidate_count + len(seed_candidates[i]),
                    min_distance_s=scan_cfg.sequence_min_distance_s,
                )
            if skip_coarse_scan and has_weighted_seeds:
                dense_candidates = _dense_weighted_seed_candidates(
                    b,
                    seed_candidates[i],
                    cfg,
                    scenes,
                    matchable_duration_s,
                )
                for dense_candidate in dense_candidates:
                    candidate_pool = _add_top_candidate(
                        candidate_pool,
                        dense_candidate[0],
                        dense_candidate[1],
                        max_candidates=(
                            scan_cfg.content_rerank_candidate_count
                            + len(seed_candidates[i])
                            + len(dense_candidates)
                        ),
                        min_distance_s=max(0.04, cfg.vision.local_scan_step_s * 0.5),
                    )

            reranked_candidates = _rerank_candidates_by_content(
                b,
                candidate_pool,
                cfg,
                scenes=scenes,
                matchable_duration_s=matchable_duration_s,
            )
            refine_limit = (
                min(scan_cfg.max_refine_candidates, cfg.vision.max_refine_candidates)
                if skip_coarse_scan and has_weighted_seeds
                else scan_cfg.max_refine_candidates
            )
            refine_candidates = [
                (coarse_score, in_point_s)
                for _, coarse_score, in_point_s in reranked_candidates[:refine_limit]
            ]
            validation_templates = _prepare_validation_templates(b, cfg)
            motion_templates = _prepare_motion_templates(b, cfg)
            logger.info(
                'Beat %d: content-reranked top %d / %d candidates.',
                b.beat_id,
                len(refine_candidates),
                len(candidate_pool),
            )

            for coarse_score, coarse_in_s in refine_candidates:
                rough_in_s = coarse_in_s
                is_weighted_seed_candidate = (
                    skip_coarse_scan
                    and has_weighted_seeds
                    and coarse_score > scan_cfg.coarse_candidate_threshold + 0.05
                )
                # Midpoint refinement is bypassed for weighted seed candidates.
                if midpoint_templates[i] is not None and not is_weighted_seed_candidate:
                    midpoint_t = coarse_in_s + (b.duration_s / 2)
                    fine_t = refine_timestamp(midpoint_templates[i], midpoint_t, cfg)
                    rough_in_s = max(0.0, fine_t - (b.duration_s / 2))

                local_align_window_s = (
                    min(cfg.vision.local_scan_step_s, cfg.cv.deep_scan.content_align_window_seconds)
                    if is_weighted_seed_candidate
                    else None
                )
                refined_in_s, sequence_score = refine_in_point_with_sequence(
                    b,
                    rough_in_s,
                    cfg,
                    search_window_s=local_align_window_s,
                )

                # Apply start pre-roll, then keep the in-point inside its scene.
                scene = _find_scene_for_time(scenes, refined_in_s, cfg)
                scene_fps = _source_fps_from_scene(scene) if scene is not None else source_fps
                adjusted_in_s = _apply_start_preroll(refined_in_s, scene_fps, cfg)
                adjusted_in_s = _clamp_to_scene_start(adjusted_in_s, scene)
                scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)

                usable_duration_s, span_score = estimate_usable_source_duration(b, adjusted_in_s, cfg)
                out_s = adjusted_in_s + usable_duration_s
                if scene is not None:
                    out_s = min(out_s, scene.end_s)
                duration_s = max(0.0, out_s - adjusted_in_s)
                duration_coverage = min(1.0, duration_s / matchable_duration_s) if matchable_duration_s > 0 else 0.0

                with open_video(cfg.paths.source_movie) as validation_cap:
                    original_content_score = _fixed_content_sequence_score(
                        validation_cap,
                        adjusted_in_s,
                        validation_templates,
                        cfg,
                    )
                content_score = original_content_score

                # Try a content-based re-alignment and adopt it only when the
                # fixed-position content score improves by at least 0.01.
                content_in_s, align_content_score = align_in_point_by_content(
                    b,
                    adjusted_in_s,
                    cfg,
                    search_window_s=(
                        local_align_window_s
                        if local_align_window_s is not None
                        else min(0.8, cfg.cv.deep_scan.content_align_window_seconds)
                    ),
                )
                if abs(content_in_s - adjusted_in_s) <= cfg.cv.deep_scan.content_align_window_seconds:
                    with open_video(cfg.paths.source_movie) as validation_cap:
                        aligned_content_score = _fixed_content_sequence_score(
                            validation_cap,
                            content_in_s,
                            validation_templates,
                            cfg,
                        )
                    if aligned_content_score >= original_content_score + 0.01:
                        adjusted_in_s = content_in_s
                        content_score = min(align_content_score, aligned_content_score)
                        scene = _find_scene_for_time(scenes, adjusted_in_s, cfg)
                        usable_duration_s = max(0.0, duration_s)
                        out_s = adjusted_in_s + usable_duration_s
                        if scene is not None:
                            out_s = min(out_s, scene.end_s)
                        duration_s = max(0.0, out_s - adjusted_in_s)
                        duration_coverage = (
                            min(1.0, duration_s / matchable_duration_s)
                            if matchable_duration_s > 0
                            else 0.0
                        )

                motion_score = 0.0
                if len(motion_templates) >= 2:
                    with open_video(cfg.paths.source_movie) as motion_cap:
                        motion_score = _motion_phase_score(
                            motion_cap,
                            adjusted_in_s,
                            motion_templates,
                            cfg,
                        )

                # Weighted seed candidates that pass the content gate may have
                # their duration extended to the contiguous scene coverage.
                if is_weighted_seed_candidate and scene is not None and content_score >= content_gate:
                    contiguous_usable_s = _contiguous_scene_coverage_duration(
                        b,
                        adjusted_in_s,
                        scenes,
                        matchable_duration_s,
                        cfg,
                    )
                    scene_duration_s = min(b.duration_s, contiguous_usable_s)
                    if scene_duration_s > duration_s:
                        usable_duration_s = scene_duration_s
                        out_s = adjusted_in_s + usable_duration_s
                        duration_s = usable_duration_s
                        duration_coverage = (
                            min(1.0, duration_s / matchable_duration_s)
                            if matchable_duration_s > 0
                            else 0.0
                        )
                        # NOTE(review): nesting reconstructed from a
                        # whitespace-collapsed source - confirm this belongs
                        # inside the duration-extension branch.
                        span_score = max(span_score, content_score)

                # Blend the individual scores into the final ranking score.
                final_score = (
                    sequence_score * scan_cfg.sequence_score_weight
                    + span_score * scan_cfg.span_score_weight
                    + coarse_score * scan_cfg.coarse_score_weight
                    + duration_coverage * scan_cfg.duration_score_weight
                )
                final_score = (
                    final_score * (1.0 - scan_cfg.content_validation_weight)
                    + content_score * scan_cfg.content_validation_weight
                )
                if len(motion_templates) >= 2:
                    motion_score_clamped = max(0.0, min(1.0, motion_score))
                    final_score = final_score * 0.82 + motion_score_clamped * 0.18
                if is_weighted_seed_candidate:
                    vision_provisional_score = (
                        content_score * 0.45
                        + duration_coverage * 0.33
                        + coarse_score * 0.12
                        + max(0.0, min(1.0, motion_score)) * 0.10
                    )
                    final_score = max(final_score, vision_provisional_score)
                if content_score < scan_cfg.match_threshold and not is_weighted_seed_candidate:
                    # Weak content validation caps the final score.
                    final_score = min(final_score, content_score)

                if content_score < content_gate:
                    logger.debug(
                        'Beat %d rejected by content validation in=%.3fs scene=%s content=%.3f min=%.3f',
                        b.beat_id,
                        adjusted_in_s,
                        scene.scene_id if scene is not None else 'none',
                        content_score,
                        content_gate,
                    )
                    rejected_content_candidates += 1
                    continue

                candidate_result = MatchResult(
                    beat_id=b.beat_id,
                    scene_id=scene.scene_id if scene is not None else 0,
                    source_path=cfg.paths.source_movie,
                    in_point_s=max(0.0, adjusted_in_s),
                    out_point_s=out_s,
                    in_point_frame=int(max(0.0, adjusted_in_s) * source_fps),
                    match_score=final_score,
                )

                if duration_coverage < scan_cfg.min_duration_coverage:
                    # Too short to win outright; remember the best plausible
                    # short candidate as a fallback.
                    rejected_short_candidates += 1
                    logger.debug(
                        'Beat %d short candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
                        b.beat_id,
                        adjusted_in_s,
                        scene.scene_id if scene is not None else 'none',
                        sequence_score,
                        span_score,
                        coarse_score,
                        content_score,
                        motion_score,
                        duration_coverage,
                        final_score,
                    )
                    long_enough_for_review = duration_s >= max(0.5, matchable_duration_s * 0.45)
                    visually_plausible = (
                        sequence_score >= scan_cfg.provisional_match_threshold
                        or final_score >= scan_cfg.provisional_match_threshold
                    )
                    if long_enough_for_review and visually_plausible:
                        if (
                            best_short_result is None
                            or candidate_result.match_score > best_short_result.match_score + scan_cfg.duration_tie_break_score_delta
                            or (
                                candidate_result.match_score >= best_short_result.match_score - scan_cfg.duration_tie_break_score_delta
                                and duration_coverage > best_short_coverage
                            )
                        ):
                            best_short_result = candidate_result
                            best_short_coverage = duration_coverage
                    continue

                logger.debug(
                    'Beat %d candidate in=%.3fs scene=%s sequence=%.3f span=%.3f coarse=%.3f content=%.3f motion=%.3f coverage=%.2f final=%.3f',
                    b.beat_id,
                    adjusted_in_s,
                    scene.scene_id if scene is not None else 'none',
                    sequence_score,
                    span_score,
                    coarse_score,
                    content_score,
                    motion_score,
                    duration_coverage,
                    final_score,
                )

                # A candidate replaces the current best when any of the
                # following tie-break rules holds.
                clearly_better_score = (
                    best_result is None
                    or candidate_result.match_score > best_result.match_score + scan_cfg.duration_tie_break_score_delta
                )
                similar_score_better_duration = (
                    best_result is not None
                    and candidate_result.match_score >= best_result.match_score - scan_cfg.duration_tie_break_score_delta
                    and duration_coverage > best_duration_coverage + 0.03
                )
                similar_vision_score_earlier_phase = (
                    is_weighted_seed_candidate
                    and best_result is not None
                    and candidate_result.scene_id == best_result.scene_id
                    and candidate_result.match_score >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta
                    and content_score >= best_content_score - 0.005
                    and duration_coverage >= best_duration_coverage - 0.03
                    and candidate_result.in_point_s < best_result.in_point_s
                )
                similar_vision_score_better_phase = (
                    is_weighted_seed_candidate
                    and best_result is not None
                    and candidate_result.scene_id == best_result.scene_id
                    and candidate_result.match_score >= best_result.match_score - cfg.vision.local_scan_tie_break_score_delta
                    and content_score > best_content_score + 0.008
                    and duration_coverage >= best_duration_coverage - 0.03
                )
                if (
                    clearly_better_score
                    or similar_score_better_duration
                    or similar_vision_score_earlier_phase
                    or similar_vision_score_better_phase
                ):
                    best_result = candidate_result
                    best_duration_coverage = duration_coverage
                    best_content_score = content_score

            if best_result is None:
                if best_short_result is not None:
                    logger.warning(
                        'Beat %d: using short provisional automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f',
                        b.beat_id,
                        best_short_result.scene_id,
                        best_short_result.in_point_s,
                        best_short_result.duration_s,
                        best_short_coverage,
                        best_short_result.match_score,
                    )
                    best_result = best_short_result
                    best_duration_coverage = best_short_coverage
                else:
                    if rejected_content_candidates > 0 and rejected_short_candidates == 0:
                        logger.warning(
                            'Beat %d: NO MATCH after refinement (%d candidates rejected by content validation)',
                            b.beat_id,
                            rejected_content_candidates,
                        )
                    else:
                        logger.warning(
                            'Beat %d: NO MATCH after refinement (%d candidates rejected below %.0f%% duration coverage, %d by content validation)',
                            b.beat_id,
                            rejected_short_candidates,
                            scan_cfg.min_duration_coverage * 100.0,
                            rejected_content_candidates,
                        )
                    continue

            is_confirmed = best_result.match_score >= cfg.cv.deep_scan.match_threshold
            if best_result.match_score < cfg.cv.deep_scan.provisional_match_threshold:
                logger.warning(
                    'Beat %d: NO MATCH after refinement (best final score %.3f, provisional threshold %.3f)',
                    b.beat_id,
                    best_result.match_score,
                    cfg.cv.deep_scan.provisional_match_threshold,
                )
                continue
            if not is_confirmed:
                logger.warning(
                    'Beat %d: provisional automatic match scene=%d in=%.3fs score=%.3f (confirmed threshold %.3f)',
                    b.beat_id,
                    best_result.scene_id,
                    best_result.in_point_s,
                    best_result.match_score,
                    cfg.cv.deep_scan.match_threshold,
                )
            logger.info(
                'Beat %d: best automatic match scene=%d in=%.3fs dur=%.3fs coverage=%.2f score=%.3f',
                b.beat_id,
                best_result.scene_id,
                best_result.in_point_s,
                best_result.duration_s,
                best_duration_coverage,
                best_result.match_score,
            )
            results.append(MatchResult(
                beat_id=b.beat_id,
                scene_id=best_result.scene_id,
                source_path=cfg.paths.source_movie,
                in_point_s=best_result.in_point_s,
                out_point_s=best_result.out_point_s,
                in_point_frame=best_result.in_point_frame,
                match_score=best_result.match_score,
                is_confirmed=is_confirmed,
            ))
        else:
            logger.warning(
                'Beat %d: NO MATCH (best coarse score %.3f, coarse threshold %.3f)',
                b.beat_id,
                score,
                cfg.cv.deep_scan.coarse_candidate_threshold,
            )

    if skip_coarse_scan and not results and cfg.vision.fullscan_fallback:
        logger.warning(
            '[Global Scan] Weighted vision-seed pass found no valid matches; retrying with full FFmpeg coarse scan.'
        )
        # Disabling the skip flag guarantees the retry runs the coarse scan
        # and cannot recurse a second time through this branch.
        retry_cfg = replace(
            cfg,
            cv=replace(
                cfg.cv,
                deep_scan=replace(cfg.cv.deep_scan, skip_coarse_scan_with_weighted_seeds=False),
            ),
        )
        return run_global_scan(beats, retry_cfg, scenes=scenes, seed_in_points=seed_in_points)

    return results